In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

In [2]:
data = pd.read_csv("train_set.zip", compression="zip")

## Capire i dati. In particolare si cerca di capire: 

- la dimensione del dataset e il tipo di dato per ogni variabile
- informazioni con rilevanza statistica
- se ci sono valori Nan

In [3]:
# dimensione dataset 
data.shape

(251342, 462)

In [4]:
# trova quale colonna nel dataset contiene Nans
data.isna().any()[lambda x: x]

target    True
dtype: bool

### Capire la numerosità delle classi 

In [5]:
data1 = data[data['target']==1]
data0 = data[data['target']==0]

In [6]:
# capire quante righe hanno come target Nan
data_null = data[data['target'].isna()]

# mettere insieme i dati buoni delle due classi
data_non_null = data1.append(data0)

## Come fare per fare data imputation per i label mancanti 

- Usare un algoritmo Kmeans che impara dai dati buoni
- Usa quello che ha imparato per fare inferenza sulle label mancanti
- Completare il dataset con il suggerimento del Kmeans

#### Qualche difficoltà con questo procedimento. 

- Ogni data point (o riga) nel dataset ha 460 dimensioni. Quindi si ha bisogno di ridurre le dimensioni per facilitare il lavoro del K-Means. 
- Per affrontare questo punto, si è usato un classificatore RandomForest che permette di estrarre l'importanza delle features così da poter usare solo le features più importanti per fare clustering. 

### Usare un classificatore RandomForest per capire quali sono le feature più importanti

In [7]:
X, y = data_non_null.iloc[:, :-1], data_non_null.iloc[:, -1]

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [9]:
from sklearn.ensemble import RandomForestClassifier

feature_names = [f'feature {i}' for i in range(X.shape[1])]
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [10]:
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)


In [11]:
forest_importances = pd.Series(importances, index=feature_names)

In [12]:
sorted_features = forest_importances.sort_values(ascending=False)
sorted_features = pd.DataFrame(sorted_features)
sorted_features.head(60)

Unnamed: 0,0
feature 18,0.022924
feature 16,0.022861
feature 14,0.016043
feature 11,0.015319
feature 17,0.013492
feature 89,0.012531
feature 12,0.011549
feature 92,0.011543
feature 397,0.010961
feature 13,0.010911


In [13]:
sorted_features.columns = [ 'rank']

In [14]:
# considero solo le n feature più importanti con n = 11
mif2 = ['feature_18','feature_16' , 'feature_14',  'feature_11', 'feature_17', 'feature_89', 
        'feature_12', 'feature_92', 'feature_397', 'feature_13', 'feature_126','target']

In [15]:
data_non_null_mif = data_non_null[mif2]

In [16]:
data_non_null_mif

Unnamed: 0,feature_18,feature_16,feature_14,feature_11,feature_17,feature_89,feature_12,feature_92,feature_397,feature_13,feature_126,target
38,1.678269,-0.780778,-0.678310,-0.099158,3.624938,-0.102744,2.355049,-0.074583,1,1.108069,-0.617708,1.0
358,2.186244,-1.347243,1.363995,0.793812,0.672473,1.945615,3.038361,-0.074764,0,-0.966967,0.662623,1.0
1024,1.160134,0.540974,-1.152416,-0.097242,3.624938,-0.083951,0.924649,-0.074764,1,-0.101593,1.916492,1.0
1224,-0.323153,-0.403134,1.473404,-0.102990,-0.311682,-0.102744,-0.337201,-0.074690,1,-1.087933,2.070528,1.0
1230,2.094809,1.107439,-1.371234,-0.102990,-0.311682,-0.102744,2.915365,-0.074764,0,0.121729,2.070528,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
251333,-0.373951,-1.158421,-0.158616,-0.102990,-0.311682,-0.102744,-0.405532,-0.074764,0,0.577679,-0.535578,0.0
251337,-0.333313,-1.158421,-0.751249,-0.102990,-0.311682,-0.102744,-0.350867,-0.074699,0,-1.069323,-0.582506,0.0
251339,-0.272356,0.163331,-0.705662,-0.095326,-0.311682,-0.102744,-0.268870,-0.074710,0,1.135985,-0.582506,0.0
251340,-0.363792,-1.158421,-0.368317,-0.089577,-0.311682,-0.096480,-0.391866,-0.074764,0,-0.548238,-0.582506,0.0


In [18]:
samples = data_non_null_mif.iloc[:, :-1].values
target = data_non_null_mif.iloc[:, 1].tolist()

model = KMeans(n_clusters=2, random_state=1)

pipe = make_pipeline(model)
pipe.fit(samples)

Pipeline(steps=[('kmeans', KMeans(n_clusters=2, random_state=1))])

In [19]:
data_null = data_null[mif2]

In [20]:
new_labels = pipe.predict(data_null.iloc[:, :-1].values)

In [21]:
data_null.iloc[:, :-1]

Unnamed: 0,feature_18,feature_16,feature_14,feature_11,feature_17,feature_89,feature_12,feature_92,feature_397,feature_13,feature_126
0,0.408331,-0.780778,0.315491,-0.099158,-0.311682,-0.102744,0.646769,-0.074589,0,0.103119,-0.503666
1,-0.333313,-1.158421,-0.778601,-0.102990,-0.311682,-0.102744,-0.350867,-0.074702,0,1.219731,-0.503666
4,-0.231718,-0.403134,0.406665,-0.102990,-0.311682,-0.102744,-0.214205,-0.074764,0,-1.543883,-0.503666
8,-0.292675,0.163331,0.251668,-0.099158,-0.311682,-0.090215,-0.296202,-0.074764,0,0.158950,-0.503666
11,0.784232,-0.214313,1.509874,-0.097242,-0.311682,-0.102744,1.152420,-0.074616,0,-1.125153,-0.503666
...,...,...,...,...,...,...,...,...,...,...,...
251332,-0.292675,-0.214313,-0.505078,-0.091493,-0.311682,-0.102744,-0.296202,-0.074679,0,0.931273,-0.561170
251334,-0.343473,0.540974,-1.261825,-0.093409,-0.311682,-0.102744,-0.364533,-0.074669,1,1.712901,-0.535578
251335,0.103546,-0.969600,-0.040089,-0.102990,-0.311682,-0.102744,0.236781,-0.074715,0,0.735866,-0.583642
251336,-0.333313,0.163331,0.069320,-0.102990,0.672473,-0.102744,-0.350867,-0.074764,0,0.345052,-0.582506


In [22]:
new_labels.sum()

948

In [23]:
new_lab_list = new_labels.tolist()
data_null['target'] = new_lab_list

In [24]:
data_null

Unnamed: 0,feature_18,feature_16,feature_14,feature_11,feature_17,feature_89,feature_12,feature_92,feature_397,feature_13,feature_126,target
0,0.408331,-0.780778,0.315491,-0.099158,-0.311682,-0.102744,0.646769,-0.074589,0,0.103119,-0.503666,0
1,-0.333313,-1.158421,-0.778601,-0.102990,-0.311682,-0.102744,-0.350867,-0.074702,0,1.219731,-0.503666,0
4,-0.231718,-0.403134,0.406665,-0.102990,-0.311682,-0.102744,-0.214205,-0.074764,0,-1.543883,-0.503666,0
8,-0.292675,0.163331,0.251668,-0.099158,-0.311682,-0.090215,-0.296202,-0.074764,0,0.158950,-0.503666,0
11,0.784232,-0.214313,1.509874,-0.097242,-0.311682,-0.102744,1.152420,-0.074616,0,-1.125153,-0.503666,0
...,...,...,...,...,...,...,...,...,...,...,...,...
251332,-0.292675,-0.214313,-0.505078,-0.091493,-0.311682,-0.102744,-0.296202,-0.074679,0,0.931273,-0.561170,0
251334,-0.343473,0.540974,-1.261825,-0.093409,-0.311682,-0.102744,-0.364533,-0.074669,1,1.712901,-0.535578,0
251335,0.103546,-0.969600,-0.040089,-0.102990,-0.311682,-0.102744,0.236781,-0.074715,0,0.735866,-0.583642,0
251336,-0.333313,0.163331,0.069320,-0.102990,0.672473,-0.102744,-0.350867,-0.074764,0,0.345052,-0.582506,0


In [25]:
final_data = data_non_null_mif.append(data_null)

In [26]:
final_data
final_data.shape

(251342, 12)

In [27]:
final_data[final_data['target']==1].shape

(4350, 12)

In [29]:
labels_2 = final_data.iloc[:, -1]
labels_2

38        1.0
358       1.0
1024      1.0
1224      1.0
1230      1.0
         ... 
251332    0.0
251334    0.0
251335    0.0
251336    0.0
251338    0.0
Name: target, Length: 251342, dtype: float64

In [30]:
data2 = data.iloc[:, :-1]
data2['target']= labels_2

In [33]:
data2.shape

(251342, 462)

In [31]:
test_data = pd.read_csv("test_set.zip", compression="zip")
len(test_data)

167562

In [34]:
X, y = data2.iloc[:, :-1], data2.iloc[:, -1]

In [35]:
test_data

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_454,feature_455,feature_456,feature_457,feature_458,feature_459,feature_460,feature_461,feature_462,feature_463
0,2.563981,-0.130512,-0.059120,-0.141570,-0.097845,-0.201220,-0.141820,-0.109972,-0.048769,-0.102990,...,1,0,0,0,0,0,1,0,0,0
1,2.779405,-0.130512,0.181206,-0.136476,-0.078306,-0.201220,-0.136630,0.420157,0.023708,-0.089577,...,1,0,0,0,0,0,0,0,1,0
2,-0.667385,-0.130512,0.181206,-0.141570,-0.097845,-0.317052,-0.141820,-0.109972,-0.048769,-0.102990,...,1,0,0,0,0,0,1,0,0,0
3,0.625162,-0.130512,-0.059120,1.962213,-0.097845,-0.085388,2.001775,-0.109972,-0.048769,-0.102990,...,1,0,0,0,0,0,1,0,0,0
4,-0.667385,-0.130512,-0.059120,-0.141570,-0.081097,-0.201220,-0.141820,0.243447,-0.040716,-0.091493,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167557,-0.667385,-0.130512,-0.059120,-0.141570,-0.097845,-0.201220,-0.141820,-0.051069,-0.048769,-0.102990,...,1,0,0,0,0,0,0,0,0,1
167558,-0.667385,-0.130512,-0.059120,-0.141570,-0.097845,-0.452190,-0.141820,-0.109972,-0.040716,-0.101074,...,1,0,0,0,0,0,1,0,0,0
167559,1.486859,-0.874698,-0.059120,-0.141570,-0.097845,-0.394274,-0.141820,0.184544,-0.008504,-0.093409,...,0,1,0,0,0,0,1,0,0,0
167560,-0.667385,-0.130512,-0.059120,-0.141570,-0.097845,-0.201220,-0.141820,0.184544,-0.008504,-0.093409,...,1,0,0,0,0,0,1,0,0,0


### Model XGBoost

In [37]:
from xgboost import XGBClassifier
model = XGBClassifier(scale_pos_weight = 57)
model.fit(X, y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=6, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [38]:
predictions = model.predict(test_data)
probas2 = model.predict_proba(test_data)
pred_probas = probas2[:, 1]
pred_prob = pred_probas.round(decimals=6, out=None)

In [39]:
DF = pd.DataFrame(pred_prob)
DF.to_csv("submissions/example_submission.csv",  header=None, index = False)