In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

In [2]:
# Load the training set directly from the zipped CSV.
data = pd.read_csv(
    "train_set.zip",
    compression="zip",
)

# Number of rows read (shown as the cell output).
len(data.index)

251342

In [3]:
# Peek at one feature column; bracket access works for any column label.
data["feature_2"]

0        -0.502605
1         0.520651
2        -0.781674
3        -0.688651
4        -0.688651
            ...   
251337   -0.874698
251338   -0.316558
251339   -0.409581
251340    1.171814
251341    0.334605
Name: feature_2, Length: 251342, dtype: float64

In [4]:
# Split the rows by class label; rows whose target equals neither 1 nor 0
# (e.g. missing labels) end up in neither subset.
data1 = data.loc[data["target"].eq(1)]
data0 = data.loc[data["target"].eq(0)]

In [5]:
# Rows whose target is missing.
data_null = data[data['target'].isna()]

# Labelled rows, positives first and negatives after them.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack frames and keeps the same row order and index.
data_non_null = pd.concat([data1, data0])

In [6]:
# Select the label by name instead of by position, so the split stays correct
# even if "target" is not the last column of the frame.
X = data_non_null.drop(columns="target")
y = data_non_null["target"]

In [7]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split of the labelled rows (default test size),
# seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Use the real column labels of the training frame so each importance is
# reported against the feature it actually belongs to, rather than against a
# synthetic "feature {i}" name derived from column position.
feature_names = list(X_train.columns)

# Random forest used only to rank the features by impurity-based importance.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [9]:
import time
import numpy as np

# perf_counter is monotonic and high-resolution, which makes it the right
# clock for measuring a short elapsed interval (time.time is wall-clock and
# can jump if the system clock is adjusted).
start_time = time.perf_counter()

# Forest-level importances.
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std(
    [tree.feature_importances_ for tree in forest.estimators_],
    axis=0,
)

elapsed_time = time.perf_counter() - start_time

print(f"Elapsed time to compute the importances: "
      f"{elapsed_time:.3f} seconds")

Elapsed time to compute the importances: 0.046 seconds


In [10]:
# Label each importance value with its feature name.
forest_importances = pd.Series(data=importances, index=feature_names)

# Display the labelled importances.
forest_importances

feature 0      0.004759
feature 1      0.000283
feature 2      0.002718
feature 3      0.003458
feature 4      0.003647
                 ...   
feature 456    0.000007
feature 457    0.001671
feature 458    0.000020
feature 459    0.001231
feature 460    0.001581
Length: 461, dtype: float64

In [11]:
# Order the features from most to least important.
sorted_features = forest_importances.sort_values(ascending=False)

In [12]:
# Wrap the sorted Series in a single-column DataFrame (feature names stay as the index).
sorted_features = pd.DataFrame(sorted_features)

In [13]:
# Show the 60 highest-ranked features.
sorted_features.iloc[:60]

Unnamed: 0,0
feature 18,0.022924
feature 16,0.022861
feature 14,0.016043
feature 11,0.015319
feature 17,0.013492
feature 89,0.012531
feature 12,0.011549
feature 92,0.011543
feature 397,0.010961
feature 13,0.010911


In [14]:
# Name the single column 'rank'.
# NOTE(review): the column holds the importance values themselves, not an ordinal rank.
sorted_features.columns = [ 'rank']

In [15]:
# Consider only the n most important features.
top_features = [
    "feature_18",
    "feature_16",
    "feature_14",
    "feature_11",
    "feature_17",
    "feature_89",
    "feature_12",
    "feature_92",
    "feature_397",
    "feature_13",
    "feature_126",
]

# Selected feature columns followed by the label; "target" is kept last because
# later cells slice the features with iloc[:, :-1].
mif2 = top_features + ["target"]

In [16]:
# Restrict the labelled rows to the selected features plus the target.
data_non_null_mif = data_non_null.loc[:, mif2]

In [17]:
# Display the reduced labelled frame (selected features + target).
data_non_null_mif

Unnamed: 0,feature_18,feature_16,feature_14,feature_11,feature_17,feature_89,feature_12,feature_92,feature_397,feature_13,feature_126,target
38,1.678269,-0.780778,-0.678310,-0.099158,3.624938,-0.102744,2.355049,-0.074583,1,1.108069,-0.617708,1.0
358,2.186244,-1.347243,1.363995,0.793812,0.672473,1.945615,3.038361,-0.074764,0,-0.966967,0.662623,1.0
1024,1.160134,0.540974,-1.152416,-0.097242,3.624938,-0.083951,0.924649,-0.074764,1,-0.101593,1.916492,1.0
1224,-0.323153,-0.403134,1.473404,-0.102990,-0.311682,-0.102744,-0.337201,-0.074690,1,-1.087933,2.070528,1.0
1230,2.094809,1.107439,-1.371234,-0.102990,-0.311682,-0.102744,2.915365,-0.074764,0,0.121729,2.070528,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
251333,-0.373951,-1.158421,-0.158616,-0.102990,-0.311682,-0.102744,-0.405532,-0.074764,0,0.577679,-0.535578,0.0
251337,-0.333313,-1.158421,-0.751249,-0.102990,-0.311682,-0.102744,-0.350867,-0.074699,0,-1.069323,-0.582506,0.0
251339,-0.272356,0.163331,-0.705662,-0.095326,-0.311682,-0.102744,-0.268870,-0.074710,0,1.135985,-0.582506,0.0
251340,-0.363792,-1.158421,-0.368317,-0.089577,-0.311682,-0.096480,-0.391866,-0.074764,0,-0.548238,-0.582506,0.0


In [18]:
# Feature matrix: every selected column except the trailing target.
samples = data_non_null_mif.iloc[:, :-1].values
# The label is the LAST column of the frame; position 1 is a feature column
# (feature_16), so it must not be used as the target.
target = data_non_null_mif.iloc[:, -1].tolist()

# You may choose whether or not to use the Normalizer.
normalizer = Normalizer()
# n_init is pinned to 10 (the historical default) so the clustering does not
# silently change with the scikit-learn version, whose default became 'auto'.
model = KMeans(n_clusters=2, n_init=10, random_state=1)

# Row-wise normalisation followed by 2-cluster KMeans, fitted on the labelled rows.
pipe = make_pipeline(normalizer, model)
pipe.fit(samples)

Pipeline(steps=[('normalizer', Normalizer()),
                ('kmeans', KMeans(n_clusters=2, random_state=1))])

In [19]:
# Restrict the unlabelled rows to the selected columns. The explicit copy makes
# this an independent frame, so the later assignment of the "target" column
# neither raises SettingWithCopyWarning nor risks writing into a temporary view.
data_null = data_null[mif2].copy()

In [20]:
# KMeans cluster ids are arbitrary: cluster "1" is not guaranteed to be the
# positive class. Align the ids with the real labels by treating as positive
# the cluster that holds the higher share of labelled positives.
labelled_clusters = pipe.predict(data_non_null_mif.iloc[:, :-1].values)
labelled_target = data_non_null_mif.iloc[:, -1].to_numpy()
positive_share = [
    labelled_target[labelled_clusters == cluster_id].mean()
    if (labelled_clusters == cluster_id).any()
    else 0.0  # a cluster with no labelled rows gives no evidence of positives
    for cluster_id in (0, 1)
]
positive_cluster = int(np.argmax(positive_share))

# Pseudo-labels for the unlabelled rows: 1 when the row falls into the
# positive cluster, 0 otherwise.
new_labels = (
    pipe.predict(data_null.iloc[:, :-1].values) == positive_cluster
).astype(int)

In [21]:
# Display the unlabelled rows without their last column.
data_null.loc[:, data_null.columns[:-1]]

Unnamed: 0,feature_18,feature_16,feature_14,feature_11,feature_17,feature_89,feature_12,feature_92,feature_397,feature_13,feature_126
0,0.408331,-0.780778,0.315491,-0.099158,-0.311682,-0.102744,0.646769,-0.074589,0,0.103119,-0.503666
1,-0.333313,-1.158421,-0.778601,-0.102990,-0.311682,-0.102744,-0.350867,-0.074702,0,1.219731,-0.503666
4,-0.231718,-0.403134,0.406665,-0.102990,-0.311682,-0.102744,-0.214205,-0.074764,0,-1.543883,-0.503666
8,-0.292675,0.163331,0.251668,-0.099158,-0.311682,-0.090215,-0.296202,-0.074764,0,0.158950,-0.503666
11,0.784232,-0.214313,1.509874,-0.097242,-0.311682,-0.102744,1.152420,-0.074616,0,-1.125153,-0.503666
...,...,...,...,...,...,...,...,...,...,...,...
251332,-0.292675,-0.214313,-0.505078,-0.091493,-0.311682,-0.102744,-0.296202,-0.074679,0,0.931273,-0.561170
251334,-0.343473,0.540974,-1.261825,-0.093409,-0.311682,-0.102744,-0.364533,-0.074669,1,1.712901,-0.535578
251335,0.103546,-0.969600,-0.040089,-0.102990,-0.311682,-0.102744,0.236781,-0.074715,0,0.735866,-0.583642
251336,-0.333313,0.163331,0.069320,-0.102990,0.672473,-0.102744,-0.350867,-0.074764,0,0.345052,-0.582506


In [22]:
# Sum of the predicted labels, i.e. how many unlabelled rows received label 1.
np.sum(new_labels)

70085

In [23]:
new_lab_list = new_labels.tolist()

# Build a new frame with the pseudo-labels instead of writing into a frame that
# was obtained by slicing; this avoids SettingWithCopyWarning and the risk of
# the assignment landing on a temporary copy.
data_null = data_null.assign(target=new_lab_list)

In [24]:
# Display the previously unlabelled rows, now carrying the assigned target values.
data_null

Unnamed: 0,feature_18,feature_16,feature_14,feature_11,feature_17,feature_89,feature_12,feature_92,feature_397,feature_13,feature_126,target
0,0.408331,-0.780778,0.315491,-0.099158,-0.311682,-0.102744,0.646769,-0.074589,0,0.103119,-0.503666,0
1,-0.333313,-1.158421,-0.778601,-0.102990,-0.311682,-0.102744,-0.350867,-0.074702,0,1.219731,-0.503666,1
4,-0.231718,-0.403134,0.406665,-0.102990,-0.311682,-0.102744,-0.214205,-0.074764,0,-1.543883,-0.503666,0
8,-0.292675,0.163331,0.251668,-0.099158,-0.311682,-0.090215,-0.296202,-0.074764,0,0.158950,-0.503666,0
11,0.784232,-0.214313,1.509874,-0.097242,-0.311682,-0.102744,1.152420,-0.074616,0,-1.125153,-0.503666,0
...,...,...,...,...,...,...,...,...,...,...,...,...
251332,-0.292675,-0.214313,-0.505078,-0.091493,-0.311682,-0.102744,-0.296202,-0.074679,0,0.931273,-0.561170,1
251334,-0.343473,0.540974,-1.261825,-0.093409,-0.311682,-0.102744,-0.364533,-0.074669,1,1.712901,-0.535578,1
251335,0.103546,-0.969600,-0.040089,-0.102990,-0.311682,-0.102744,0.236781,-0.074715,0,0.735866,-0.583642,1
251336,-0.333313,0.163331,0.069320,-0.102990,0.672473,-0.102744,-0.350867,-0.074764,0,0.345052,-0.582506,1


In [25]:
# Training set = originally labelled rows followed by the pseudo-labelled rows.
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row
# order and index.
final_data = pd.concat([data_non_null_mif, data_null])

In [26]:
# Only the last expression of a cell is displayed, so the bare `final_data`
# that used to precede this line had no effect and has been dropped.
final_data.shape

(251342, 12)

In [27]:
# Shape of the subset whose target equals 1 (row count = number of positives).
final_data.loc[final_data["target"].eq(1)].shape

(73487, 12)

In [28]:
# Use a weighted logistic regression model for classification.
# The weights should be set according to how well each class is represented.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

weights = {0: 1.0, 1: 5.0}
scaler = StandardScaler()
logreg = LogisticRegression(class_weight=weights, solver="lbfgs")

# Features are every column but the last; the last column is the target.
# Note that X and y are rebound here and replace the earlier labelled-only split.
y = final_data.iloc[:, -1]
X = scaler.fit_transform(final_data.iloc[:, :-1])

logreg.fit(X, y)

LogisticRegression(class_weight={0: 1.0, 1: 5.0})

In [29]:
# Load the test set directly from the zipped CSV.
test_data = pd.read_csv(
    "test_set.zip",
    compression="zip",
)

# Number of rows read (shown as the cell output).
len(test_data.index)

167562

In [30]:
# Keep the selected feature columns only: every entry of mif2 except the
# trailing "target".
test_data = test_data[mif2[:-1]]

In [31]:
# Display the test rows restricted to the selected feature columns.
test_data

Unnamed: 0,feature_18,feature_16,feature_14,feature_11,feature_17,feature_89,feature_12,feature_92,feature_397,feature_13,feature_126
0,-0.231718,0.352153,-0.176851,-0.102990,-0.311682,-0.102744,-0.214205,-0.074764,0,-0.194644,-0.589640
1,-0.312994,-0.969600,1.136059,-0.089577,-0.311682,-0.083951,-0.371366,-0.074119,1,-0.734340,-0.564428
2,-0.008209,-0.025491,0.233434,-0.102990,1.656628,-0.102744,0.077342,-0.074764,0,0.177560,-0.496207
3,0.123865,0.540974,-1.316530,-0.102990,1.656628,-0.102744,-0.018322,-0.074764,1,1.284866,-0.538612
4,-0.363792,0.918618,-1.179768,-0.091493,-0.311682,-0.096480,-0.391866,-0.074563,1,1.619850,-0.543882
...,...,...,...,...,...,...,...,...,...,...,...
167557,-0.373951,-1.347243,0.661953,-0.102990,-0.311682,-0.102744,-0.405532,-0.074710,0,-0.259780,-0.483002
167558,2.003373,0.918618,-1.225356,-0.101074,-0.311682,-0.096480,1.367663,-0.074538,1,-1.153069,1.916492
167559,-0.252037,0.729796,-0.833306,-0.093409,-0.311682,-0.102744,-0.241537,-0.074429,0,1.163900,-0.584239
167560,-0.282515,0.540974,1.354878,-0.093409,-0.311682,-0.077687,-0.282536,-0.074541,0,-1.236815,-0.503666


In [32]:
# The test features must go through the SAME transformation the model was
# trained on. Fitting a fresh StandardScaler on the test set would standardise
# it with different means and variances, shifting the inputs relative to what
# the classifier learned.
test_scaler = scaler  # alias kept so the name `test_scaler` still exists
test_data = test_scaler.transform(test_data)

In [33]:
# Hard class predictions of the logistic regression on the scaled test features.
predictions = logreg.predict(test_data)

In [34]:
# Per-class probabilities from the logistic regression (one column per class).
probas = logreg.predict_proba(test_data)

In [35]:
from pathlib import Path

# Second probability column, i.e. the one for class 1 given the 0/1 targets.
pred_probas = probas[:, 1]
pred_prob = pred_probas.round(decimals=6)
DF = pd.DataFrame(pred_prob)

# Create the output folder if needed so the write cannot fail on a fresh checkout.
Path("submissions").mkdir(parents=True, exist_ok=True)

# One probability per line, with no header and no index column.
# NOTE: the XGBoost section further down writes to this same path and will
# overwrite this file.
DF.to_csv("submissions/example_submission.csv", header=False, index=False)

### Model XGBoost

In [36]:
from xgboost import XGBClassifier

# Gradient-boosted classifier trained on the same scaled features as the
# logistic regression. scale_pos_weight=6 is passed to weight the positive class.
# Note that `model` is rebound here and no longer refers to the KMeans estimator.
model = XGBClassifier(scale_pos_weight=6)
model.fit(X, y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=6, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
# Per-class probabilities and hard predictions from the XGBoost model.
probas2 = model.predict_proba(test_data)
predictions = model.predict(test_data)

# Keep the second probability column, rounded to 6 decimals.
pred_probas = probas2[:, 1]
pred_prob = np.round(pred_probas, decimals=6)

In [38]:
from pathlib import Path

DF = pd.DataFrame(pred_prob)

# Create the output folder if needed so the write cannot fail on a fresh checkout.
Path("submissions").mkdir(parents=True, exist_ok=True)

# One probability per line, with no header and no index column.
DF.to_csv("submissions/example_submission.csv", header=False, index=False)