In [96]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_selection import chi2

In [97]:
data = pd.read_csv("../master_dataset/processed_data.csv") # shld be preprocessed data
data.head()

Unnamed: 0,title,text,class,text_without_stopwords,title_without_stopwords,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,...,polarity,overall_content,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability,polarity_category,polarity_category_Neutral,polarity_category_Positive
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1,donald trump wish americans happy new year lea...,donald trump sends out embarrassing new year’s...,516,13,28,1,4.80404,...,0.082132,donald trump sends out embarrassing new year’s...,0.002194,0.747636,0.001007,0.15766,0.091503,Positive,0,1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,house intelligence committee chairman devin nu...,drunk bragging trump staffer started russian c...,309,9,11,1,5.213115,...,-0.005004,drunk bragging trump staffer started russian c...,0.064904,0.244962,0.557051,0.00232,0.130763,Neutral,1,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",1,on friday revealed former milwaukee sheriff da...,sheriff david clarke becomes an internet joke ...,600,16,25,1,5.168966,...,-0.012345,sheriff david clarke becomes an internet joke ...,0.002488,0.433611,0.28146,0.001917,0.280524,Neutral,1,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",1,on christmas day donald trump announced would ...,trump is so obsessed he even has obama’s name ...,475,15,15,1,5.18018,...,-0.023118,trump is so obsessed he even has obama’s name ...,0.002963,0.788261,0.204377,0.00229,0.002109,Neutral,1,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,1,pope francis used annual christmas day message...,pope francis just called out donald trump duri...,434,12,19,1,4.554762,...,-0.011722,pope francis just called out donald trump duri...,0.292172,0.327938,0.001138,0.020911,0.357842,Neutral,1,0


In [98]:
# Observe which data types to use
dt = pd.DataFrame(data.dtypes).reset_index()
data_numerics = dt[dt[0].isin([np.dtype('int64'), np.dtype('float64')])]
data_text = dt[dt[0].isin([np.dtype('object')])]

In [99]:
dt

Unnamed: 0,index,0
0,title,object
1,text,object
2,class,int64
3,text_without_stopwords,object
4,title_without_stopwords,object
5,text_word_count,int64
6,title_word_count,int64
7,text_sentence_count,int64
8,title_sentence_count,int64
9,text_average_word_length,float64


In [100]:
data_numerics

Unnamed: 0,index,0
2,class,int64
5,text_word_count,int64
6,title_word_count,int64
7,text_sentence_count,int64
8,title_sentence_count,int64
9,text_average_word_length,float64
10,title_average_word_length,float64
11,text_punctuation_count,int64
12,title_punctuation_count,int64
13,text_stopwords_count,int64


In [101]:
data_text

Unnamed: 0,index,0
0,title,object
1,text,object
3,text_without_stopwords,object
4,title_without_stopwords,object
19,overall_content,object
25,polarity_category,object


### Logistic Regression

In [102]:
lr = LogisticRegression(random_state = 4222)

In [103]:
# X, y split of training data
X_train, X_test, y_train, y_test = train_test_split(data[list(data_numerics['index'])[1:]], data['class'], test_size = 0.2, random_state = 4222)

#balance x_train with oversampling
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='minority')

# OverSampling only works on Dataframe, but current x_train and y_train are series
X_train, y_train = oversample.fit_resample(X_train, y_train)

#check that train set is oversampled
y_train.value_counts()

1    16961
0    16961
Name: class, dtype: int64

In [104]:
scores, pvalues = chi2(abs(X_train), y_train)

In [105]:
# determine significance of numeric variables with chi-squared test
com_dic = {'X2':X_train.columns, 'pvalues':pvalues}
result = pd.DataFrame(com_dic).sort_values(['pvalues'])
cols = result[result['pvalues'] > 0]
cols

Unnamed: 0,X2,pvalues
11,flesch_readability,1.829082e-259
17,Topic 4 Probability,2.2832779999999998e-176
7,title_punctuation_count,1.335698e-88
18,Topic 5 Probability,5.319065e-57
2,text_sentence_count,1.922695e-50
12,subjectivity,7.445406e-40
3,title_sentence_count,1.309082e-30
5,title_average_word_length,4.206444e-15
19,polarity_category_Neutral,7.981961e-13
16,Topic 3 Probbility,6.126604e-07


In [106]:
cols = cols['X2'].to_list()

In [107]:
X_train[cols]

Unnamed: 0,flesch_readability,Topic 4 Probability,title_punctuation_count,Topic 5 Probability,text_sentence_count,subjectivity,title_sentence_count,title_average_word_length,polarity_category_Neutral,Topic 3 Probbility,polarity,text_average_word_length,polarity_category_Positive
0,0.391774,0.006012,0,0.041384,5,0.455000,1,6.166667,1,0.466760,0.025000,5.202381,0
1,-1.427085,0.634943,0,0.002526,3,0.510017,1,6.777778,1,0.001683,-0.030976,5.491597,0
2,0.104912,0.191493,0,0.001081,29,0.469371,1,6.090909,0,0.374188,0.108198,5.093447,1
3,0.251438,0.049426,0,0.925208,38,0.414244,1,6.300000,1,0.000721,0.002904,5.047305,0
4,-0.336504,0.001350,2,0.001237,18,0.266967,1,5.750000,1,0.874651,0.001941,5.341924,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33917,-16.066846,0.029802,3,0.637963,1,0.000000,1,5.176471,1,0.018443,0.000000,43.000000,0
33918,0.232578,0.200566,3,0.003612,7,0.423363,1,5.533333,0,0.216901,0.158333,4.987013,1
33919,0.441711,0.435759,0,0.018838,15,0.418056,1,5.333333,1,0.001092,-0.011979,4.712821,0
33920,0.120348,0.509370,0,0.002290,11,0.428238,1,5.142857,0,0.001535,0.064810,4.665730,1


In [108]:
# fit the model
lr_fit = lr.fit(X_train[cols], y_train)
y_pred = lr.predict(X_test[cols])

accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred) * 100
recall = recall_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred)

results = pd.DataFrame({"Model": 'Logistic Regression',
                        "Accuracy (%)": [accuracy], 
                        "Precision (%)": [precision], 
                        "Recall (%)": [recall], 
                        "F1 Score": [f1]})
results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy (%),Precision (%),Recall (%),F1 Score
0,Logistic Regression,74.198138,70.231681,74.549614,0.723263


In [109]:
print(lr.coef_, lr.intercept_)

[[-5.79626619e-01 -2.11377876e+00  5.20851657e-02  1.03402983e+00
  -1.37374056e-03  7.76134029e+00  2.28270297e+00 -3.64591980e-01
  -7.86645760e-02 -2.88150604e-01  4.63880759e-01 -1.95602913e-01
  -1.61566785e-01]] [-2.27108594]


In [110]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.74      0.76      4235
           1       0.70      0.75      0.72      3497

    accuracy                           0.74      7732
   macro avg       0.74      0.74      0.74      7732
weighted avg       0.74      0.74      0.74      7732

