In [55]:
from numpy import linalg as LA
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import networkx as nx
import collections
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, log_loss
import warnings
from sklearn.svm import SVC
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from  sklearn.cluster import KMeans
from sklearn.metrics import roc_curve,auc,roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import  average_precision_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA


In [56]:
relations_features_df = pd.read_csv("../data/relations_features3.csv", delimiter = '\t', index_col="src")

  mask |= (ar1 == a)


In [57]:
userdata_df = pd.read_csv("../data/usersdata.csv", delimiter = '\t', names = ["ID", "Gender", "Time", "Age", "Label"], index_col = "ID")

In [58]:
userdata_df.head()

Unnamed: 0_level_0,Gender,Time,Age,Label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,M,0.9,30,0
2,F,1.0,20,0
3,M,0.1375,30,0
4,M,0.3875,20,0
5,M,0.0125,20,0


## Normalize data

In [59]:
df = userdata_df.merge(relations_features_df, how='inner', left_index=True, right_index=True)

In [60]:
df = pd.concat([df, pd.get_dummies(df['Gender'], prefix='Gender', drop_first=True)], axis=1).drop(['Gender'], axis=1)

In [61]:
graph_features_subset = df[["deg_tot", "deg_out", "u_neigh", "n_bidir", "w_out", "w_in", "Label"]]

In [62]:
# Assign weights to sample equal number of Spam and Ham
df["inv_value_count"] = 0
df["inv_value_count"][df.Label == 1] = 1/df["Label"].value_counts()[1]
df["inv_value_count"][df.Label == 0] = 1/df["Label"].value_counts()[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [63]:
# to sample equal number of spam/ham we add weight with inv_value_count columns
df_sample = df.sample(n = 50000, weights = 'inv_value_count').drop(["inv_value_count"], axis=1)

In [64]:
X, y = df.drop("Label", axis=1), df[["Label"]]
X, y = X.values, y.values

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [67]:
from sklearn.preprocessing import StandardScaler

In [69]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Random Forest

In [54]:
grid_params_rf = {
    'bootstrap': [True],
    'max_depth': [2, 5, 10],
    'n_estimators': [10, 20, 50]
}
gs_rf = GridSearchCV(RandomForestClassifier(), grid_params_rf, verbose = 1, cv = 5, n_jobs = -1)
gs_results_rf = gs_rf.fit(X_train, y_train)
print(gs_results_rf.best_score_)
print(gs_results_rf.best_estimator_)
print(gs_results_rf.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 20.4min finished
  self.best_estimator_.fit(X, y, **fit_params)


1.0
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
{'bootstrap': True, 'max_depth': 5, 'n_estimators': 10}


In [None]:
## XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42
)

In [None]:
grid_params_xgb = {
    'max_depth': [2, 5, 10],
    'n_estimators': [10, 20, 50, 100],
    'learning_rate': [0.1, 0.01, 0.05]
}

In [None]:
gs_xgb = GridSearchCV(xgb, grid_params_xgb, verbose = 1, cv = 3, n_jobs = -1)
gs_results_xgb = gs_xgb.fit(X, y)

In [None]:
print(gs_results_xgb.best_score_)
print(gs_results_xgb.best_estimator_)
print(gs_results_xgb.best_params_)