In [4]:
# Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import Word

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
import pickle


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:


def count_vectorizer(train_data, test_data, ngram):
    """Build bag-of-words count features for the title and text columns.

    A separate ``CountVectorizer`` is fitted on each of the ``clean_title``
    and ``clean_text`` columns of ``train_data``. The fitted vocabulary is
    then applied (transform only) to the same column of ``test_data``, so the
    test set never influences the vocabulary.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the ``clean_title`` and ``clean_text`` columns.
    ngram : tuple of (int, int)
        Passed unchanged as ``ngram_range`` to the vectorizer, e.g. ``(1, 2)``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Dense train and test feature frames. Title feature columns come
        first, followed by the text feature columns. A token that is in both
        vocabularies therefore appears twice under the same column label.
    """
    train_parts = []
    test_parts = []

    for column in ('clean_title', 'clean_text'):
        vectorizer = CountVectorizer(lowercase=True, stop_words='english', min_df=0.01,
                                     ngram_range=ngram, strip_accents='ascii')

        # Learn the vocabulary from the training column only.
        train_matrix = vectorizer.fit_transform(train_data[column])

        # get_feature_names() was deprecated in scikit-learn 1.0 and removed
        # in 1.2; prefer the replacement and fall back for old installs.
        if hasattr(vectorizer, 'get_feature_names_out'):
            tokens = list(vectorizer.get_feature_names_out())
        else:
            tokens = vectorizer.get_feature_names()

        train_parts.append(pd.DataFrame(train_matrix.toarray(), columns=tokens))

        # Test data: transform only, reusing the training vocabulary.
        test_matrix = vectorizer.transform(test_data[column])
        test_parts.append(pd.DataFrame(test_matrix.toarray(), columns=tokens))

    # Place title and text features side by side.
    vectorized_train = pd.concat(train_parts, axis=1)
    vectorized_test = pd.concat(test_parts, axis=1)
    return vectorized_train, vectorized_test


def tfidf_vectorizer(train_data, test_data, ngram):
    """Build tf-idf features for the title and text columns.

    A separate ``TfidfVectorizer`` is fitted on each of the ``clean_title``
    and ``clean_text`` columns of ``train_data``. The fitted vocabulary and
    idf weights are then applied (transform only) to the same column of
    ``test_data``, so the test set never influences the fit.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the ``clean_title`` and ``clean_text`` columns.
    ngram : tuple of (int, int)
        Passed unchanged as ``ngram_range`` to the vectorizer, e.g. ``(1, 2)``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Dense train and test feature frames. Title feature columns come
        first, followed by the text feature columns. A token that is in both
        vocabularies therefore appears twice under the same column label.
    """
    train_parts = []
    test_parts = []

    for column in ('clean_title', 'clean_text'):
        vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', min_df=0.01,
                                     ngram_range=ngram, strip_accents='ascii')

        # Learn vocabulary and idf weights from the training column only.
        train_matrix = vectorizer.fit_transform(train_data[column])

        # get_feature_names() was deprecated in scikit-learn 1.0 and removed
        # in 1.2; prefer the replacement and fall back for old installs.
        if hasattr(vectorizer, 'get_feature_names_out'):
            tokens = list(vectorizer.get_feature_names_out())
        else:
            tokens = vectorizer.get_feature_names()

        train_parts.append(pd.DataFrame(train_matrix.toarray(), columns=tokens))

        # Test data: transform only, reusing the training fit.
        test_matrix = vectorizer.transform(test_data[column])
        test_parts.append(pd.DataFrame(test_matrix.toarray(), columns=tokens))

    # Place title and text features side by side.
    vectorized_train = pd.concat(train_parts, axis=1)
    vectorized_test = pd.concat(test_parts, axis=1)
    return vectorized_train, vectorized_test


if __name__ == '__main__':
    # End-to-end feature build: read the CSV, clean title/text, split into
    # train/test, vectorize with six (vectorizer, n-gram) combinations and
    # pickle every resulting frame together with the labels.

    # read the dataset
    data = pd.read_csv('newData_w_title.csv')

    X = pd.DataFrame(data[['title', 'text']])
    y = pd.DataFrame(data['label'])

    #######################################
    ############TEXT CLEANING##############
    #####APPLIED TO BOTH TRAIN AND TEST####
    #######################################
    # remove digits
    # remove words less than 3 characters
    # remove punctuation
    #
    # regex=True is required: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would silently leave the
    # text uncleaned. Raw strings avoid invalid-escape warnings for \d, \w, \s.
    for raw_col, clean_col in (('title', 'clean_title'), ('text', 'clean_text')):
        cleaned = X[raw_col].str.replace(r'\d+', ' ', regex=True)  # for digits
        cleaned = cleaned.str.replace(r'(\b\w{1,2}\b)', ' ', regex=True)  # for words less than 3 characters
        cleaned = cleaned.str.replace(r'[^\w\s]', ' ', regex=True)  # for punctuation
        X[clean_col] = cleaned

    # lemmatization (word by word, after whitespace tokenisation)
    X['clean_title'] = X['clean_title'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    X['clean_text'] = X['clean_text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    # Split to train and test dataset (80/20, fixed seed so the split is repeatable)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1000)

    # Reset all the index so the frames built from the vectorizer output
    # (which start at 0) line up row-for-row with the labels
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    print(X_train.head())
    print(y_train['label'].value_counts())
    print(y_test['label'].value_counts())

    # Count Vectorize - Unigram
    ngram = (1, 1)
    train_data_count_uni, test_data_count_uni = count_vectorizer(X_train, X_test, ngram)

    print(train_data_count_uni)
    print(test_data_count_uni)

    # Count Vectorize - Bigram
    ngram = (1, 2)
    train_data_count_bi, test_data_count_bi = count_vectorizer(X_train, X_test, ngram)

    print(train_data_count_bi)
    print(test_data_count_bi)

    # Count Vectorize - Trigram
    ngram = (1, 3)
    train_data_count_tri, test_data_count_tri = count_vectorizer(X_train, X_test, ngram)

    print(train_data_count_tri)
    print(test_data_count_tri)

    # Tfidf Vectorize - Unigram
    ngram = (1, 1)
    train_data_tfidf_uni, test_data_tfidf_uni = tfidf_vectorizer(X_train, X_test, ngram)

    print(train_data_tfidf_uni)
    print(test_data_tfidf_uni)

    # Tfidf Vectorize - Bigram
    ngram = (1, 2)
    train_data_tfidf_bi, test_data_tfidf_bi = tfidf_vectorizer(X_train, X_test, ngram)

    print(train_data_tfidf_bi)
    print(test_data_tfidf_bi)

    # Tfidf Vectorize - Trigram
    ngram = (1, 3)
    train_data_tfidf_tri, test_data_tfidf_tri = tfidf_vectorizer(X_train, X_test, ngram)

    print(train_data_tfidf_tri)
    print(test_data_tfidf_tri)

    # Persist everything as one tuple. Consumers index it positionally, so
    # the order below must not change:
    #   0-5   count vectors  (uni, bi, tri), each as train then test
    #   6-11  tf-idf vectors (uni, bi, tri), each as train then test
    #   12-13 y_train, y_test
    with open('vectorizedData-separated.pkl', 'wb') as f:
        obj = (train_data_count_uni,
               test_data_count_uni,
               train_data_count_bi,
               test_data_count_bi,
               train_data_count_tri,
               test_data_count_tri,
               train_data_tfidf_uni,
               test_data_tfidf_uni,
               train_data_tfidf_bi,
               test_data_tfidf_bi,
               train_data_tfidf_tri,
               test_data_tfidf_tri,
               y_train,
               y_test)
        pickle.dump(obj, f)



                                               title  \
0  Goldman Sachs Endorses Hillary Clinton For Pre...   
1  1 World Trade Center Gains Popularity in the P...   
2            Remove All Obstacles To Source Of Light   
3  According to this college prof, canoes are sym...   
4  Breaking: Emergency Call to Action at Standing...   

                                                text  \
0  Well finally we have the big global Business/B...   
1  You glance toward Lower Manhattan and expect t...   
2  Leave a reply \nAA Gabriel – When you pray to ...   
3  Print \nYou know, canoes? Those boats that you...   
4  Home / Badge Abuse / Breaking: Emergency Call ...   

                                         clean_title  \
0  Goldman Sachs Endorses Hillary Clinton For Pre...   
1  World Trade Center Gains Popularity the Panthe...   
2                  Remove All Obstacles Source Light   
3  According this college prof canoe are symbol c...   
4  Breaking Emergency Call Action Standing Roc

       america  american  anti  attack  black  breaking  breitbart  campaign  \
0            0         0     0       0      0         0          0         0   
1            0         0     0       0      0         0          0         0   
2            0         0     0       0      0         0          0         0   
3            0         0     0       0      0         0          0         0   
4            0         0     0       1      0         1          0         0   
...        ...       ...   ...     ...    ...       ...        ...       ...   
23995        0         0     0       0      0         0          0         0   
23996        0         0     0       0      0         0          0         0   
23997        0         0     0       0      0         0          0         0   
23998        0         0     0       0      0         0          0         0   
23999        0         0     0       0      0         0          0         0   

       china  clinton  ...  york  york 

       america  american  anti    attack  black  breaking  breitbart  \
0          0.0       0.0   0.0  0.000000    0.0  0.000000        0.0   
1          0.0       0.0   0.0  0.000000    0.0  0.000000        0.0   
2          0.0       0.0   0.0  0.000000    0.0  0.000000        0.0   
3          0.0       0.0   0.0  0.000000    0.0  0.000000        0.0   
4          0.0       0.0   0.0  0.583554    0.0  0.588866        0.0   
...        ...       ...   ...       ...    ...       ...        ...   
23995      0.0       0.0   0.0  0.000000    0.0  0.000000        0.0   
23996      0.0       0.0   0.0  0.000000    0.0  0.000000        0.0   
23997      0.0       0.0   0.0  0.000000    0.0  0.000000        0.0   
23998      0.0       0.0   0.0  0.000000    0.0  0.000000        0.0   
23999      0.0       0.0   0.0  0.000000    0.0  0.000000        0.0   

       campaign  china   clinton  ...      york  york city  york times  \
0           0.0    0.0  0.445534  ...  0.000000   0.000000   

In [7]:
# Load the pickled tuple of vectorized train/test frames and labels.
# Only unpickle files you produced yourself: unpickling can execute arbitrary code.
vector_data= pd.read_pickle("vectorizedData-separated.pkl")

In [8]:
# Inspect element 4 of the loaded tuple (the count-vector training frame for n-grams (1, 3)).
vector_data[4]

Unnamed: 0,america,american,anti,attack,black,breaking,breitbart,campaign,china,clinton,...,york,york city,york times,young,young people,younger,youth,youtube,zero,zone
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,4,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
23996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23998,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Count-vector features, trigram setting (noted as the best count-vector model):
# tuple slot 4 holds the training frame, slot 5 the test frame.
train_data, test_data = vector_data[4], vector_data[5]

# Tf-idf features, bigram setting (noted as the best tf-idf model):
# tuple slot 8 holds the training frame, slot 9 the test frame.
train_data_tfidf, test_data_tfidf = vector_data[8], vector_data[9]

In [10]:
# Display the count-vector test frame selected above.
test_data

Unnamed: 0,america,american,anti,attack,black,breaking,breitbart,campaign,china,clinton,...,york,york city,york times,young,young people,younger,youth,youtube,zero,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# The last two tuple slots carry the labels: 12 is train, 13 is test.
y_train, y_test = vector_data[12], vector_data[13]

In [12]:
#Libraries - For Machine learning model 
"""
import pandas as pd
from sklearn.model_selection import train_test_split
import string 
import re
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import Word
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score


In [13]:
def evaluation_matrix(y_test, y_pred):
    """Score predictions against the true labels.

    Returns a 4-tuple ``(accuracy, precision, recall, f_score)``. Precision,
    recall and F-score are macro-averaged; the support value reported by
    ``precision_recall_fscore_support`` is discarded.
    """
    accuracy = accuracy_score(y_test, y_pred)
    macro_scores = precision_recall_fscore_support(y_test, y_pred, average='macro')
    # macro_scores is (precision, recall, f_score, support); keep the first three.
    return (accuracy, *macro_scores[:3])

def conv_dataframe_to_array(y_train, y_test):
    """Flatten the train and test label frames into 1-D arrays.

    Each argument is converted via ``.values`` and then ``ravel()``.
    Returns the pair ``(y_train, y_test)`` in that order.
    """
    return tuple(labels.values.ravel() for labels in (y_train, y_test))


In [14]:
# Replace the label DataFrames with flat 1-D arrays.
# Not idempotent: running this cell twice fails, because the arrays produced
# by the first run have no `.values` attribute. Re-run the label-loading cell first.
y_train, y_test = conv_dataframe_to_array(y_train, y_test)

In [None]:
###########Optimized Model##################
###########DO NOT RUN#######################
# Randomized hyper-parameter search for the count-vector random forest:
# 100 sampled candidates x 3 folds = 300 fits.

from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
# Number of features to consider at every split.
# 'auto' was dropped from the list: for classifiers it was an alias of 'sqrt'
# (so it only duplicated that candidate) and it was removed in scikit-learn 1.3.
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


# Use the random grid to search for best hyperparameters - Count Vector
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_data, y_train)
# A bare expression in the middle of a cell is not displayed, so print it.
print(rf_random.best_params_)


#Random Forest Optimized- CountVec

# best_estimator_ lives on the fitted search object (rf_random), not on the
# base estimator; the previous `rf.random...` raised AttributeError.
best_random_countvec = rf_random.best_estimator_
y_pred_train_best = best_random_countvec.predict(train_data)


accuracy_train, precision_train, recall_train, f_score_train = evaluation_matrix(y_train, y_pred_train_best)

print('CountVec Train Best')
print('accuracy= ', accuracy_train)
print('precision= ', precision_train)
print('recall= ', recall_train)
print('f_score= ', f_score_train)

y_pred_best = best_random_countvec.predict(test_data)
accuracy, precision, recall, f_score = evaluation_matrix(y_test, y_pred_best)

print('CountVec Test Best')
print('accuracy= ', accuracy)
print('precision= ', precision)
print('recall= ', recall)
print('f_score= ', f_score)


In [15]:
###########With Optimized parameter results##################

# Random forest on the count-vector features, with fixed hyper-parameters
# (presumably the outcome of the randomized search; confirm before changing).
classifier = RandomForestClassifier(
    n_estimators=157,
    random_state=42,
    max_depth=70,
    n_jobs=-1,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
)
classifier.fit(train_data, y_train)

# Training-set scores, to compare against the test scores below.
y_pred_train = classifier.predict(train_data)
accuracy_train, precision_train, recall_train, f_score_train = evaluation_matrix(y_train, y_pred_train)

print('CountVec Train')
for label, value in (('accuracy= ', accuracy_train),
                     ('precision= ', precision_train),
                     ('recall= ', recall_train),
                     ('f_score= ', f_score_train)):
    print(label, value)

# Held-out test-set scores.
y_pred = classifier.predict(test_data)
accuracy, precision, recall, f_score = evaluation_matrix(y_test, y_pred)

print('CountVec Test')
for label, value in (('accuracy= ', accuracy),
                     ('precision= ', precision),
                     ('recall= ', recall),
                     ('f_score= ', f_score)):
    print(label, value)



CountVec Train
accuracy=  0.999375
precision=  0.9993667325148159
recall=  0.9993835523468151
f_score=  0.9993748489989
CountVec Test
accuracy=  0.9588333333333333
precision=  0.9590113209384179
recall=  0.9588917195866905
f_score=  0.9588319324754799


In [16]:
# Random forest on the tf-idf features, using the same hyper-parameters as
# the count-vector model.
classifier_tfidf = RandomForestClassifier(
    n_estimators=157,
    random_state=42,
    max_depth=70,
    n_jobs=-1,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
)
classifier_tfidf.fit(train_data_tfidf, y_train)

# Training-set scores, to compare against the test scores below.
y_pred_tfidf_train = classifier_tfidf.predict(train_data_tfidf)
accuracy_tfidf_train, precision_tfidf_train, recall_tfidf_train, f_score_tfidf_train = evaluation_matrix(y_train, y_pred_tfidf_train)

print('Tfidf train')
for label, value in (('accuracy= ', accuracy_tfidf_train),
                     ('precision= ', precision_tfidf_train),
                     ('recall= ', recall_tfidf_train),
                     ('f_score= ', f_score_tfidf_train)):
    print(label, value)

# Held-out test-set scores.
y_pred_tfidf = classifier_tfidf.predict(test_data_tfidf)
accuracy_tfidf, precision_tfidf, recall_tfidf, f_score_tfidf = evaluation_matrix(y_test, y_pred_tfidf)

print('Tfidf test')
for label, value in (('accuracy= ', accuracy_tfidf),
                     ('precision= ', precision_tfidf),
                     ('recall= ', recall_tfidf),
                     ('f_score= ', f_score_tfidf)):
    print(label, value)

Tfidf train
accuracy=  0.9995833333333334
precision=  0.9995768807649996
recall=  0.9995899286475847
f_score=  0.9995832310397074
Tfidf test
accuracy=  0.9555
precision=  0.9555931223972152
recall=  0.9555440688090684
f_score=  0.9554995537594142


In [22]:
# Display the count-vector test frame again.
test_data

Unnamed: 0,america,american,anti,attack,black,breaking,breitbart,campaign,china,clinton,...,york,york city,york times,young,young people,younger,youth,youtube,zero,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# save the model to disk
filename = 'finalized_model_countvec_tri.sav'
# The context manager flushes and closes the file handle even if pickling
# fails; a bare open() inside the call left it to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)
 


In [18]:
# save the model to disk
filename_tfidf = 'finalized_model_tfidf_bigram.sav'
# The context manager flushes and closes the file handle even if pickling
# fails; a bare open() inside the call left it to the garbage collector.
with open(filename_tfidf, 'wb') as model_file:
    pickle.dump(classifier_tfidf, model_file)


In [19]:
# some time later...

# load the model from disk
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# model files you created yourself. The context manager closes the handle.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Re-score the reloaded model on the count-vector test set.
y_pred = loaded_model.predict(test_data)
accuracy, precision, recall, f_score = evaluation_matrix(y_test, y_pred)

print('CountVec Test')
print('accuracy= ', accuracy)
print('precision= ', precision)
print('recall= ', recall)
print('f_score= ', f_score)


CountVec Test
accuracy=  0.9588333333333333
precision=  0.9590113209384179
recall=  0.9588917195866905
f_score=  0.9588319324754799


In [22]:
# some time later...

# load the model from disk
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# model files you created yourself. The context manager closes the handle.
with open(filename_tfidf, 'rb') as model_file:
    loaded_model_tfidf = pickle.load(model_file)

# Re-score the reloaded model on the tf-idf test set.
y_pred_tfidf = loaded_model_tfidf.predict(test_data_tfidf)
accuracy_tfidf, precision_tfidf, recall_tfidf, f_score_tfidf = evaluation_matrix(y_test, y_pred_tfidf)

print('Tfidf test')
print('accuracy= ', accuracy_tfidf)
print('precision= ', precision_tfidf)
print('recall= ', recall_tfidf)
print('f_score= ', f_score_tfidf)

Tfidf test
accuracy=  0.9555
precision=  0.9555931223972152
recall=  0.9555440688090684
f_score=  0.9554995537594142


In [23]:
# Per-class probabilities from the reloaded count-vector model, instead of hard labels.
# Note: this rebinds y_pred, which held the predicted labels in the earlier cells.

y_pred = loaded_model.predict_proba(test_data)
y_pred

array([[0.97341394, 0.02658606],
       [0.33319209, 0.66680791],
       [0.00650612, 0.99349388],
       ...,
       [0.15764331, 0.84235669],
       [0.05576638, 0.94423362],
       [0.08280255, 0.91719745]])