In [82]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import chi2

import keras
import tensorflow
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Activation, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.utils import pad_sequences

In [83]:
data = pd.read_csv("processed_data.csv") # shld be preprocessed data
data.head()

Unnamed: 0,title,text,subject,date,class,text_without_stopwords,title_without_stopwords,text_word_count,title_word_count,text_sentence_count,...,polarity,overall_content,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability,polarity_category,polarity_category_Neutral,polarity_category_Positive
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,donald trump wish americans happy new year lea...,donald trump sends out embarrassing new year’s...,516,13,28,...,0.082132,donald trump sends out embarrassing new year’s...,0.002194,0.747636,0.001007,0.15766,0.091503,Positive,0,1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,house intelligence committee chairman devin nu...,drunk bragging trump staffer started russian c...,309,9,11,...,-0.005004,drunk bragging trump staffer started russian c...,0.064904,0.244962,0.557051,0.00232,0.130763,Neutral,1,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,on friday revealed former milwaukee sheriff da...,sheriff david clarke becomes an internet joke ...,600,16,25,...,-0.012345,sheriff david clarke becomes an internet joke ...,0.002488,0.433611,0.28146,0.001917,0.280524,Neutral,1,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,on christmas day donald trump announced would ...,trump is so obsessed he even has obama’s name ...,475,15,15,...,-0.023118,trump is so obsessed he even has obama’s name ...,0.002963,0.788261,0.204377,0.00229,0.002109,Neutral,1,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,pope francis used annual christmas day message...,pope francis just called out donald trump duri...,434,12,19,...,-0.011722,pope francis just called out donald trump duri...,0.292172,0.327938,0.001138,0.020911,0.357842,Neutral,1,0


In [84]:
# Observe which data types to use
dt = pd.DataFrame(data.dtypes).reset_index()
data_numerics = dt[dt[0].isin([np.dtype('int64'), np.dtype('float64')])]
data_text = dt[dt[0].isin([np.dtype('object')])]

In [85]:
dt

Unnamed: 0,index,0
0,title,object
1,text,object
2,subject,object
3,date,object
4,class,int64
5,text_without_stopwords,object
6,title_without_stopwords,object
7,text_word_count,int64
8,title_word_count,int64
9,text_sentence_count,int64


In [86]:
data_numerics

Unnamed: 0,index,0
4,class,int64
7,text_word_count,int64
8,title_word_count,int64
9,text_sentence_count,int64
10,title_sentence_count,int64
11,text_average_word_length,float64
12,title_average_word_length,float64
13,text_punctuation_count,int64
14,title_punctuation_count,int64
15,text_stopwords_count,int64


In [87]:
data_text

Unnamed: 0,index,0
0,title,object
1,text,object
2,subject,object
3,date,object
5,text_without_stopwords,object
6,title_without_stopwords,object
21,overall_content,object
27,polarity_category,object


### Logistic Regression

In [88]:
lr = LogisticRegression(random_state = 0)
scaler = MinMaxScaler()

In [89]:
# X, y split of training data
X_train, X_test, y_train, y_test = train_test_split(data[['polarity', 'subjectivity', 'flesch_readability']], data['class'], test_size = 0.2, random_state = 0)

In [90]:
scores, pvalues = chi2(abs(X_train), y_train)

In [91]:
com_dic = {'X2':X_train.columns, 'pvalues':pvalues}
result = pd.DataFrame(com_dic)
print(result)
# based on just 3 factors, subjectivity is more significant than polarity, need to investigate further

                   X2        pvalues
0            polarity   1.731023e-05
1        subjectivity   4.585605e-35
2  flesch_readability  4.701484e-267


In [92]:
# fit the model
#X_train = scaler.fit_transform(X_train)
lr_fit = lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred) * 100
recall = recall_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred)

results = pd.DataFrame({"Model": 'Logistic Regression',
                        "Accuracy (%)": [accuracy], 
                        "Precision (%)": [precision], 
                        "Recall (%)": [recall], 
                        "F1 Score": [f1]})
results

Unnamed: 0,Model,Accuracy (%),Precision (%),Recall (%),F1 Score
0,Logistic Regression,70.150026,69.03503,60.375723,0.644157


In [93]:
print(lr.coef_, lr.intercept_)

[[-0.83141044  7.49287713 -0.07257623]] [-3.1944284]


### CNN

In [94]:
# Our dictionary will contain only of the top 7000 words appearing most frequently
top_words = 7000

# Now we split our data-set into training and test data
X_train, X_test, y_train, y_test = train_test_split(data[['title_word_count']], data['class'], test_size = 0.2, random_state = 0)

# Looking at the nature of training data
print('Shape of training data: ')
print(X_train.shape)
print(y_train.shape)

print('Shape of test data: ')
print(X_test.shape)
print(y_test.shape)

# Padding the data samples to a maximum review length in words
max_words = 450

"""
--- Need to do word embedding to convert text to a list containing numeric data first ---

X_train = pad_sequences(X_train, maxlen=max_words)
X_test = pad_sequences(X_test, maxlen=max_words)

# Building the CNN Model
model = Sequential()      # initilaizing the Sequential nature for CNN model

# Adding the embedding layer which will take in maximum of 450 words as input and provide a 32 dimensional output of those words which belong in the top_words dictionary
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build()
model.summary()

# Fitting the data onto model
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=2)

# Getting score metrics from our model
scores = model.evaluate(X_test, y_test, verbose=0)

# Displays the accuracy of correct sentiment prediction over test data
print("Accuracy: %.2f%%" % (scores[1]*100))

"""

Shape of training data: 
(30926, 1)
(30926,)
Shape of test data: 
(7732, 1)
(7732,)


'\n--- Need to do word embedding to convert text to a list containing numeric data first ---\n\nX_train = pad_sequences(X_train, maxlen=max_words)\nX_test = pad_sequences(X_test, maxlen=max_words)\n\n# Building the CNN Model\nmodel = Sequential()      # initilaizing the Sequential nature for CNN model\n\n# Adding the embedding layer which will take in maximum of 450 words as input and provide a 32 dimensional output of those words which belong in the top_words dictionary\nmodel.add(Embedding(top_words, 32, input_length=max_words))\nmodel.add(Conv1D(32, 3, padding=\'same\', activation=\'relu\'))\nmodel.add(MaxPooling1D())\nmodel.add(Flatten())\nmodel.add(Dense(250, activation=\'relu\'))\nmodel.add(Dense(1, activation=\'sigmoid\'))\n\nmodel.compile(loss=\'binary_crossentropy\', optimizer=\'adam\', metrics=[\'accuracy\'])\nmodel.build()\nmodel.summary()\n\n# Fitting the data onto model\nmodel.fit(X_train, y_train, epochs=2, batch_size=128, verbose=2)\n\n# Getting score metrics from our mo

In [95]:
# print(model.summary())

In [96]:
"""
# fit the model
cnn_pl = Pipeline([('standardize', scaler),
                    ('oversample', ros),
                    ('cnn', cnn_model)])

fit = cnn_pl.fit(X_train, y_train, epochs=20, batch_size=5, verbose=0)
y_pred = cnn_pl.predict(X_test)

accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred) * 100
recall = recall_score(y_test, y_pred) * 100
f1 = f1_score(y_test, y_pred)

results = pd.DataFrame({"Model": 'CNN',
                        "Accuracy (%)": [accuracy], 
                        "Precision (%)": [precision], 
                        "Recall (%)": [recall], 
                        "F1 Score": [f1]})
results
"""

'\n# fit the model\ncnn_pl = Pipeline([(\'standardize\', scaler),\n                    (\'oversample\', ros),\n                    (\'cnn\', cnn_model)])\n\nfit = cnn_pl.fit(X_train, y_train, epochs=20, batch_size=5, verbose=0)\ny_pred = cnn_pl.predict(X_test)\n\naccuracy = accuracy_score(y_test, y_pred) * 100\nprecision = precision_score(y_test, y_pred) * 100\nrecall = recall_score(y_test, y_pred) * 100\nf1 = f1_score(y_test, y_pred)\n\nresults = pd.DataFrame({"Model": \'CNN\',\n                        "Accuracy (%)": [accuracy], \n                        "Precision (%)": [precision], \n                        "Recall (%)": [recall], \n                        "F1 Score": [f1]})\nresults\n'

In [97]:
# evaluate the training and testing performance of your model 
# note: you should extract check both the loss function and your evaluation metric
#score = cnn_model.evaluate(X_train, y_train, verbose=0)
#print('Train loss:', score[0])
#print('Train accuracy:', score[1])