In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import altair as alt

# define the model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, cohen_kappa_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve, average_precision_score

import plotly.express as px
import plotly.offline as py
import plotly.io as pio
import plotly.graph_objs as go 
pio.renderers.default = 'notebook_connected'
from xgboost import plot_importance
import category_encoders as ce

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import plotly.figure_factory as ff

Selecting only the columns required

In [106]:
pred=pd.read_csv('../data/fullDF.csv')

Setting Target Columns

In [107]:
pred['is_popular'] = np.where(pred['n_comments'].between(0, 300, inclusive=False), 'No','Yes')


Boolean inputs to the `inclusive` argument are deprecated infavour of `both` or `neither`.



In [108]:
pred=pred.drop(['subsection','headline', 'abstract','keywords','pub_date', 'n_comments', 'uniqueID', 'text','overall_sentiment','week_count'],axis=1)

In [124]:
pred.groupby(['is_popular']).size()

is_popular
0    12333
1    12333
dtype: int64

In [113]:
#target column
target_col = ["is_popular"]
#number of levels in feature to be a categorical feature
nlevels = 6

Setting which columns are categorical, numerical and binary to determine how ot encode them later

In [114]:
#target column
target_col = ["is_popular"]
#number of levels in feature to be a categorical feature
nlevels = 6

#categorical columns
cat_cols = list(set(pred.nunique()[pred.nunique()<nlevels].keys().tolist() 
                    + pred.select_dtypes(include='object').columns.tolist()))
cat_cols = [x for x in cat_cols if x not in target_col]

#Numerical columns
num_cols = [x for x in pred.columns if x not in cat_cols + target_col]


#Binary columns with 2 values
bin_cols = pred.nunique()[pred.nunique() == 2].keys().tolist()

#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]


Properly Encoding Columns as required

In [115]:
#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols:
    pred[i] = le.fit_transform(pred[i])

#Duplicating columns for multi value columns
pred = pd.get_dummies(data = pred, columns = multi_cols)

In [116]:
from imblearn.over_sampling import SMOTE
# Resampling the minority class. The strategy can be changed as required.
sm = SMOTE(sampling_strategy='minority', random_state=42)
# Fit the model to generate the data.
oversampled_X, oversampled_Y = sm.fit_resample(pred.drop('is_popular', axis=1), pred['is_popular'])
pred = pd.concat([pd.DataFrame(oversampled_Y), pd.DataFrame(oversampled_X)], axis=1)

In [117]:
# Saving data frame with upscaling
pred.to_csv('../data/avi_upscaled.csv', index=False)

Setting Train and Test Data

In [118]:
#Separating the train and test datasets
train_data = pred[:13429]
test_data = pred[13429:]

Writing a generic model method

In [119]:
y_test_data=test_data.drop('is_popular',axis=1)
y_labels=test_data['is_popular']

In [120]:
def asd_prediction(algorithm, training_x, testing_x, training_y, testing_y, cf, threshold_plot):
    #model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    #probabilities = algorithm.predict_proba(testing_x)
        
    #print('Algorithm:', type(algorithm)._name_)
    print("Accuracy Score:", accuracy_score(testing_y, predictions))
    
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    print("Confusion Matrix:",conf_matrix)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions) 
    print("Area under curve:", model_roc_auc,"\n")

In [121]:
#defining the studied or used independent features (columns) as well the target  
cols = [i for i in train_data.columns if i not in target_col]

#splitting the principal training dataset to subtrain and subtest datasets
x_train, x_test, y_train, y_test = train_test_split(train_data[cols], train_data[target_col], 
                                                    test_size = .25, random_state = 2464)

Doing Cross Validation Now

In [123]:
# define models and parameters
model_logit = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model_logit, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best: 0.822692 using {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.820011 (0.008788) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.815180 (0.008988) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.819879 (0.008482) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.820706 (0.008677) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.815774 (0.009901) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.820773 (0.008675) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.820740 (0.008725) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.815610 (0.009658) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.821236 (0.008918) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.822659 (0.008996) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.815311 (0.011067) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.822692 (0.009503) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.820640 (0.008325) wit


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [105]:
from sklearn.linear_model import LogisticRegression

#Baseline model        
logit = LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                           penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
                           verbose=0, warm_start=False)

asd_prediction(logit, x_train, y_test_data, y_train, y_labels, "coefficients", threshold_plot=True)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Accuracy Score: 0.7995830851697439
Confusion Matrix: [[ 730  436]
 [ 237 1955]]
Area under curve: 0.758975801605088 



In [85]:
from sklearn.model_selection import GridSearchCV
tree_para = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
clf = GridSearchCV(DecisionTreeClassifier(), tree_para, cv=cv)
clf.fit(x_train, y_train)
print("Best parameters: {}".format(clf.best_params_))
print("Best cross-validation score: {:.2f}".format(clf.best_score_))

Best parameters: {'criterion': 'entropy', 'max_depth': 6}
Best cross-validation score: 0.79


In [89]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(max_depth = 6, random_state = 223,
                                       splitter = "best", criterion = "entropy")

asd_prediction(decision_tree,x_train, y_test_data, y_train, y_labels, "features", threshold_plot=True)

Accuracy Score: 0.7900536033353186
Confusion Matrix: [[ 692  474]
 [ 231 1961]]
Area under curve: 0.7440493890147863 



In [None]:
from sklearn.model_selection import GridSearchCV
tree_para = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
clf = GridSearchCV(RandomForestClassifier(), tree_para, cv=cv)
clf.fit(x_train, y_train)
print("Best parameters: {}".format(clf.best_params_))
print("Best cross-validation score: {:.2f}".format(clf.best_score_))

In [104]:
from sklearn.ensemble import RandomForestClassifier

decision_tree = RandomForestClassifier(max_depth = 20, random_state = 223, criterion = "entropy")

asd_prediction(decision_tree, x_train, y_test_data, y_train, y_labels ,"features", threshold_plot=True)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Accuracy Score: 0.800774270399047
Confusion Matrix: [[ 684  482]
 [ 187 2005]]
Area under curve: 0.7506553536327328 



In [None]:
asd_prediction(decision_tree, x_train, x_test, y_train, y_test, "features", threshold_plot=True)