In [1]:
import pickle 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
### Load Data ###
with open("pickles/titles", "rb") as file:
    titles = pickle.load(file)
with open("pickles/texts", "rb") as file:
    texts = pickle.load(file)
df = pd.read_csv("dataset/dataset.csv")
labels = df["Label"]
source = df["Company"]
print(titles)
print(texts)
print(labels)
print(source)

                                                 Title  \
0         How Trump's presidency has played for Russia   
1    Tokyo 2020 Olympics: Japanese PM Abe insists G...   
2       Coronavirus is Boris Johnson's worst nightmare   
3    She heard a woman yelling at the grocery store...   
4    Here's what could really sink the global econo...   
..                                                 ...   
897  Trump Ends North Korean Talks Early After Kim ...   
898  Doctors Warn Drinking A Shot Every Time Trump ...   
899  Trump Looking Forward To Knighthood From The Q...   
900  Nation Wonders How Much More Fucking Evidence ...   
901  President Trump Alleges He Was Sexually Assaul...   

                                                Tokens  \
0                  [trump, presidency, played, russia]   
1    [tokyo, olympics, japanese, pm, abe, insists, ...   
2        [coronavirus, boris, johnson, bad, nightmare]   
3    [heard, woman, yell, grocery, store, next, mov...   
4    [could, 

In [3]:
TitleString = titles["TitleString"]
TitleString

0                         trump presidency played russia
1      tokyo olympics japanese pm abe insists game go...
2                coronavirus boris johnson bad nightmare
3      heard woman yell grocery store next move drew ...
4      could really sink global economy trillion risk...
                             ...                        
897    trump end north korean talk early kim suggests...
898    doctor warn drinking shot everi time trump lie...
899                  trump look forward knighthood queen
900     nation wonder fuck evidence mueller need exactly
901    president trump alleges sexually assault hilla...
Name: TitleString, Length: 902, dtype: object

In [4]:
titleVector = TfidfVectorizer()
titleVector

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
titleVectorArray = titleVector.fit_transform(TitleString).toarray()

In [6]:
TextString = texts["TextString"]
TextString

0      need james bond recogn fact modern american pr...
1      cnn japanese prime minister shinzo abe insist ...
2      london cnn outbreak novel coronavirus caus pol...
3      chat us facebook messenger find happen world u...
4      london cnn business company spent year sinc gl...
                             ...                        
897    hanoi detail emerg suggest president trump wal...
898    viral drinking game involves consum shot everi...
899    london trump tweet excitement becom first amer...
900    accumul many assum damn evidence ever sit pres...
901    fussa president united state begun tour asia m...
Name: TextString, Length: 902, dtype: object

In [7]:
textVector = TfidfVectorizer()
textVectorArray = textVector.fit_transform(TextString).toarray()

In [8]:
print(textVectorArray)
print(labels)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0       True
1       True
2       True
3       True
4       True
       ...  
897    False
898    False
899    False
900    False
901    False
Name: Label, Length: 902, dtype: bool


In [8]:
#### Learning on Just Title ####
### SPLIT DATASET ###
title_train,title_test,label_train,label_test = train_test_split(titleVectorArray, labels, test_size=0.2, random_state=7)

In [9]:
## Fraction of true Labels in training Set
## print(list(label_train).count(True)/len(list(label_train)))
## Fraction of true Labels in Test Set
## print(list(label_test).count(True)/len(list(label_test)))

In [10]:
model = MultinomialNB()

In [11]:
model.fit(title_train, label_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
result = model.predict(title_test)

In [None]:
print("accuracy score: ", accuracy_score(label_test, result)  )

In [None]:
#### Learning on text #####
text_train, text_test, label_train, label_test = train_test_split(textVectorArray, labels, test_size=0.2, random_state=7)

In [None]:
model2 = MultinomialNB()
model2.fit(text_train, label_train)

In [None]:
result = model2.predict(text_test)

In [None]:
print("Accuracy: ", accuracy_score(label_test, result))

In [35]:
#### Splitting before vectorization ###
text_train, text_test, label_train, label_test = train_test_split(TextString, labels, test_size=0.2, random_state=7)

In [36]:
textVector = TfidfVectorizer()
textVectorArray = textVector.fit_transform(text_train).toarray()
textVectorArray

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
model = MultinomialNB()
model.fit(textVectorArray, label_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
testVectorArray = textVector.transform(text_test).toarray()
testVectorArray

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
result = model.predict(testVectorArray)
print("Accuracy: " , accuracy_score(label_test, result))

Accuracy:  0.8729281767955801


In [27]:
############

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
############# LEARNING ON BOTH FEATURES ############
VectorArray = np.concatenate((titleVectorArray, textVectorArray), axis = 1) 

In [14]:
VectorArray_train, VectorArray_test, label_train, label_test = train_test_split(VectorArray, labels, test_size=0.2, random_state=7)

In [86]:
l1=list(range(140,181))
for i in range(len(l1)):
    l1[i]=l1[i]*0.001
print(l1)

[0.14, 0.14100000000000001, 0.14200000000000002, 0.14300000000000002, 0.14400000000000002, 0.145, 0.146, 0.147, 0.148, 0.149, 0.15, 0.151, 0.152, 0.153, 0.154, 0.155, 0.156, 0.157, 0.158, 0.159, 0.16, 0.161, 0.162, 0.163, 0.164, 0.165, 0.166, 0.167, 0.168, 0.169, 0.17, 0.171, 0.17200000000000001, 0.17300000000000001, 0.17400000000000002, 0.17500000000000002, 0.176, 0.177, 0.178, 0.179, 0.18]


In [87]:
##GridSearch
param_grid=dict(alpha=l1)
print(param_grid)

{'alpha': [0.14, 0.14100000000000001, 0.14200000000000002, 0.14300000000000002, 0.14400000000000002, 0.145, 0.146, 0.147, 0.148, 0.149, 0.15, 0.151, 0.152, 0.153, 0.154, 0.155, 0.156, 0.157, 0.158, 0.159, 0.16, 0.161, 0.162, 0.163, 0.164, 0.165, 0.166, 0.167, 0.168, 0.169, 0.17, 0.171, 0.17200000000000001, 0.17300000000000001, 0.17400000000000002, 0.17500000000000002, 0.176, 0.177, 0.178, 0.179, 0.18]}


In [28]:
print(VectorArray_train)
print(VectorArray_train.shape)
print(label_train.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(721, 19735)
(721,)


In [88]:
nb=MultinomialNB()
grid = GridSearchCV(nb, param_grid, cv=10,return_train_score=False)
grid.fit(VectorArray_train,label_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.14, 0.14100000000000001, 0.14200000000000002, 0.14300000000000002, 0.14400000000000002, 0.145, 0.146, 0.147, 0.148, 0.149, 0.15, 0.151, 0.152, 0.153, 0.154, 0.155, 0.156, 0.157, 0.158, 0.159, 0.16, 0.161, 0.162, 0.163, 0.164, 0.165, 0.166, 0.167, 0.168, 0.169, 0.17, 0.171, 0.17200000000000001, 0.17300000000000001, 0.17400000000000002, 0.17500000000000002, 0.176, 0.177, 0.178, 0.179, 0.18]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [89]:
grid.cv_results_

{'mean_fit_time': array([0.18289816, 0.1776248 , 0.17778761, 0.17751832, 0.17697539,
        0.18094132, 0.1788348 , 0.17599354, 0.17612436, 0.1773385 ,
        0.18137074, 0.17811606, 0.17780075, 0.17919927, 0.1692522 ,
        0.19034219, 0.17956502, 0.18122633, 0.1725791 , 0.17358935,
        0.17171221, 0.17709191, 0.16923482, 0.17282295, 0.17331507,
        0.18079774, 0.17471702, 0.17004294, 0.18157594, 0.18147161,
        0.18010457, 0.17785928, 0.17213347, 0.17678635, 0.18269358,
        0.17580664, 0.17607808, 0.17780609, 0.18186433, 0.17772245,
        0.1743567 ]),
 'std_fit_time': array([0.01904377, 0.00866213, 0.01029474, 0.01252896, 0.00634623,
        0.00646704, 0.00993986, 0.00558153, 0.00604343, 0.01251541,
        0.01357419, 0.0134805 , 0.00671446, 0.01179092, 0.01138041,
        0.02253362, 0.01329728, 0.01606442, 0.017877  , 0.00359371,
        0.00624445, 0.00758744, 0.01024447, 0.0084632 , 0.00474678,
        0.01053415, 0.01455977, 0.01426664, 0.00559577, 0.012

In [90]:
grid.best_score_

0.897364771151179

In [91]:
grid.best_params_

{'alpha': 0.14}

In [92]:
modelx = MultinomialNB(alpha=0.14)
modelx.fit(VectorArray_train, label_train)

MultinomialNB(alpha=0.14, class_prior=None, fit_prior=True)

In [93]:
accuracy = modelx.predict(VectorArray_test)

In [94]:
print("Accuracy: ", accuracy_score(label_test, accuracy))

Accuracy:  0.9281767955801105


In [33]:
################### Learning on Just source ###########
source_train, source_train, label_train, label_test = train_test_split(source, label, test_size=0.2, random_state=7)
print(len(source_train), len(label_train))

721 721


In [34]:
modely = MultinomialNB()
modely.fit(source_train, label_train)
result = modely.predict(label_test)

ValueError: could not convert string to float: 'realnewsrightnow'

In [27]:
def l2s(L):
    return " ".join(L)
ti = ""
s = l2s(clean(nltk.word_tokenize(ti)))
tiVector = titleVector.transform([s]).toarray()
new = "Take a deep breath and hold your breath for more than 10 seconds. If you complete it successfully without coughing, without discomfort stiffness or tightness, etc., it proves there is no Fibrosis in the lungs, basically indicates no infection.In critical time, please self-check every morning in an environment with clean air"
newVector= textVector.transform([l2s(clean(nltk.word_tokenize(new)))]).toarray()

vec = np.concatenate((tiVector,newVector), axis=1)
print(vec.shape)
print(vec.shape)

(1, 19735)
(1, 19735)


In [28]:
print(modelx.predict(vec))

[ True]


In [16]:
###############################################################################################V

VectorArray = np.concatenate((titleVectorArray, textVectorArray), axis=1)
VectorArray.shape

(902, 19735)

In [17]:
VA_train, VA_test, label_train, label_test = train_test_split(VectorArray, labels, test_size=0.2, random_state=7)

721 721


In [23]:
TitleString
TextString

0      need james bond recogn fact modern american pr...
1      cnn japanese prime minister shinzo abe insist ...
2      london cnn outbreak novel coronavirus caus pol...
3      chat us facebook messenger find happen world u...
4      london cnn business company spent year sinc gl...
                             ...                        
897    hanoi detail emerg suggest president trump wal...
898    viral drinking game involves consum shot everi...
899    london trump tweet excitement becom first amer...
900    accumul many assum damn evidence ever sit pres...
901    fussa president united state begun tour asia m...
Name: TextString, Length: 902, dtype: object

In [63]:
title_train, title_test, text_train, text_test, label_train, label_test = train_test_split(TitleString, TextString, labels, test_size=0.2, random_state=7)

In [64]:
titleVector = TfidfVectorizer()
titleVectorArray = titleVector.fit_transform(title_train).toarray()
titleVectorArray.shape

(721, 2651)

In [65]:
textVector = TfidfVectorizer()
textVectorArray = textVector.fit_transform(text_train).toarray()
textVectorArray.shape

(721, 15055)

In [66]:
vector = np.concatenate((titleVectorArray, textVectorArray), axis=1)
vector.shape

(721, 17706)

In [67]:
##GridSearch
param_grid=dict(alpha=[0.01,0.02,0.03,0.001,0.002,0.003,0.0001,0.0002,0.0003])
print(param_grid)
nb=MultinomialNB()
grid = GridSearchCV(nb, param_grid, cv=10,return_train_score=False)
grid.fit(vector,label_train)

{'alpha': [0.01, 0.02, 0.03, 0.001, 0.002, 0.003, 0.0001, 0.0002, 0.0003]}




GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.01, 0.02, 0.03, 0.001, 0.002, 0.003, 0.0001, 0.0002, 0.0003]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [68]:
grid.cv_results_

{'mean_fit_time': array([0.08938956, 0.09128578, 0.08325143, 0.08297076, 0.08333082,
        0.08491924, 0.08272619, 0.08228238, 0.08427498]),
 'std_fit_time': array([0.00655918, 0.00707767, 0.00414112, 0.0033143 , 0.00252898,
        0.00594827, 0.00318342, 0.00135652, 0.00149597]),
 'mean_score_time': array([0.00361154, 0.00221179, 0.0023082 , 0.00281231, 0.00249393,
        0.0025104 , 0.00231433, 0.00269392, 0.0025934 ]),
 'std_score_time': array([0.00292444, 0.00339817, 0.00288371, 0.00283418, 0.00091981,
        0.00345733, 0.00354823, 0.0006387 , 0.00048862]),
 'param_alpha': masked_array(data=[0.01, 0.02, 0.03, 0.001, 0.002, 0.003, 0.0001, 0.0002,
                    0.0003],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0.01},
  {'alpha': 0.02},
  {'alpha': 0.03},
  {'alpha': 0.001},
  {'alpha': 0.002},
  {'alpha': 0.003},
  {'alpha': 0.0001},
  {

In [69]:
print("best score:",grid.best_score_)
print("best parameter:",grid.best_params_)

best score: 0.8904299583911235
best parameter: {'alpha': 0.002}


In [70]:
model = MultinomialNB(alpha=0.002)
model.fit(vector, label_train)

MultinomialNB(alpha=0.002, class_prior=None, fit_prior=True)

In [71]:
titleTestVector = titleVector.transform(title_test).toarray()
textTestVector = textVector.transform(text_test).toarray()
print(titleTestVector.shape, textTestVector.shape)
VectorTest = np.concatenate((titleTestVector, textTestVector), axis=1)
VectorTest.shape

(181, 2651) (181, 15055)


(181, 17706)

In [72]:
result = model.predict(VectorTest)
print("Accu: ", accuracy_score(label_test, result))

Accu:  0.9171270718232044


In [18]:
with open("pickles/model", "wb") as pick:
    pickle.dump(model, pick)

In [15]:
with open("pickles/titleVector", "wb") as pick:
    pickle.dump(titleVector, pick)

In [16]:
with open("pickles/textVector", "wb") as pick:
    pickle.dump(textVector, pick)