In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
import itertools
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
import spacy
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import string
import re
import nltk
import collections
from nltk.corpus import stopwords
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from empath import Empath
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

In [None]:
# Load the primary dataset and convert its numeric labels to the
# 'REAL'/'FAKE' strings used for every dataset in this notebook.
# NOTE(review): this treats 0 as real and 1 as fake for data.csv -- confirm
# against the dataset's own documentation.
df = pd.read_csv('Dataset/data.csv')
df['Label'] = df['Label'].replace({0: 'REAL', 1: 'FAKE'})
df['Label'].value_counts()

In [None]:
# The URLs column is not used by any later cell, so remove it.
df = df.drop(columns=['URLs'])
df.columns

In [None]:
# Keep only the rows of fake.csv whose type is 'fake' and relabel them as
# 'FAKE' so they match the label strings used by the other datasets.
df1 = pd.read_csv('Dataset/fake.csv')
# .copy() detaches the filtered rows from the full frame, so the assignment
# below writes to an independent frame instead of a slice (which is what
# triggers pandas' SettingWithCopyWarning).
df1 = df1.loc[df1['type'] == 'fake'].copy()
df1['type'] = 'FAKE'

In [None]:
# Keep title/text/type and rename them to the shared Headline/Body/Label schema.
df1 = df1[['title', 'text', 'type']].rename(
    columns={'title': 'Headline', 'text': 'Body', 'type': 'Label'})
df1['Label'].value_counts()

In [None]:
# Load the fake_or_real_news dataset and show which columns it provides.
df2 = pd.read_csv('Dataset/fake_or_real_news.csv')
df2.columns

In [None]:
# Keep title/text/label and rename them to the shared Headline/Body/Label schema.
df2 = df2[['title', 'text', 'label']].rename(
    columns={'title': 'Headline', 'text': 'Body', 'label': 'Label'})
df2['Label'].value_counts()

In [None]:


# Load train.csv and show which columns it provides.
df3 = pd.read_csv('Dataset/train.csv')
df3.columns



In [None]:
# Keep title/text/label, rename them to the shared schema, and convert the
# numeric labels (0 -> 'REAL', 1 -> 'FAKE') to the strings used elsewhere.
df3 = df3[['title', 'text', 'label']].rename(
    columns={'title': 'Headline', 'text': 'Body', 'label': 'Label'})
df3['Label'] = df3['Label'].replace({0: 'REAL', 1: 'FAKE'})
df3['Label'].value_counts()

In [None]:
# Stack df1, df2 and df3 underneath df and renumber the rows 0..n-1.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; one
# pd.concat call builds the same frame and copies the data only once.
df = pd.concat([df, df1, df2, df3], ignore_index=True)

In [None]:
# Remove exact duplicate rows, then discard every article whose body, once
# converted to a string, is shorter than ten characters (this also removes
# missing bodies, because str(NaN) is the three-character 'nan').
df = df.drop_duplicates()
body_too_short = df['Body'].map(lambda art: len(str(art)) < 10).astype(bool)
df = df.loc[~body_too_short]

df

In [None]:
# Number of articles per label in the merged, cleaned frame.
df['Label'].value_counts()

In [None]:
# Bar chart of the per-label article counts.
df['Label'].value_counts().plot(kind = 'bar')

In [None]:
# Character length of each headline (non-string headlines are stringified first).
df['headline_length'] = df['Headline'].map(lambda a: len(str(a)))
df['headline_length'].describe()

In [None]:
# Character length of each article body.
df['body_length'] = df['Body'].map(len)
df['body_length'].describe()

In [None]:
# Summary statistics for df, which now includes the two length columns added above.
df.describe()

In [None]:
# Build one text field per article and create the train/test split used by
# the word-bigram models.
# A space is inserted between headline and body; without it the last headline
# word and the first body word are fused into a single token.
df["Text"] = df["Headline"].map(str) + " " + df["Body"]
y = df.Label.astype('str')
# Fixed seed so the split, and therefore the reported accuracies, can be reproduced.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['Text'], y, test_size=0.33, random_state=42)
X_train

In [None]:
#Tf-idf Bigrams
# Word-bigram tf-idf features with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range = (2,2))

# Learn the vocabulary and idf weights on the training texts only.
tfidf1_train = tfidf_vectorizer.fit_transform(X_train.astype('str'))

# Encode the test texts with the vocabulary learned above.
tfidf1_test = tfidf_vectorizer.transform(X_test.astype('str'))

# Persist both matrices; the context managers guarantee the files are
# flushed and closed.
with open("tfidf1_train.pickle", "wb") as train_file:
    pickle.dump(tfidf1_train, train_file)

with open("tfidf1_test.pickle", "wb") as test_file:
    pickle.dump(tfidf1_test, test_file)

In [None]:
# Last ten entries of the vectorizer's feature-name list.
# NOTE(review): this is a positional slice of the vocabulary, not a ranking
# by tf-idf weight.
tfidf_vectorizer.get_feature_names()[-10:]

In [None]:
# Display the bigram tf-idf training matrix.
tfidf1_train

In [None]:
#Confusion Matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; cm[i, j] is drawn in row i (true label) and
        column j (predicted label).
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum before plotting, so both the
        colours and the printed numbers are per-true-class fractions.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    # Normalise before drawing anything so the heat map and the cell
    # annotations are based on the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any samples stay at zero instead of becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are rounded to two decimals; raw counts are shown as-is.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Multinomial Naive Bayes on the word-bigram tf-idf features.
clf = MultinomialNB()
clf.fit(tfidf1_train, Y_train)
# Save the fitted model; the context manager closes the file handle.
with open('tfidf_nb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(tfidf1_test)
score = metrics.accuracy_score(Y_test, pred)
print("Accuracy with Multinomial Naive Bayes:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(Y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Gradient boosting on the word-bigram tf-idf features.
clf = GradientBoostingClassifier()
clf.fit(tfidf1_train, Y_train)
# Save the fitted model; the context manager closes the file handle.
with open('tfidf_gb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(tfidf1_test)
score = metrics.accuracy_score(Y_test, pred)
print("Accuracy with Gradient Boosting:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(Y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Random forest on the word-bigram tf-idf features.
clf = RandomForestClassifier()
clf.fit(tfidf1_train, Y_train)
# Save the fitted model; the context manager closes the file handle.
with open('tfidf_rf', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(tfidf1_test)
score = metrics.accuracy_score(Y_test, pred)
print("Accuracy with RandomForestClassifier:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(Y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Replace every article by the sequence of its coarse part-of-speech tags
# (spaCy's token.pos_) and store the result in a new 'Text_pos' column.
nlp = spacy.load('en_core_web_sm')
# A space separates headline and body so the boundary words are not fused
# into one token before tagging.
df["Text"] = df["Headline"].map(str) + " " + df["Body"]
# nlp.pipe processes the texts as a stream in batches, which yields the same
# Doc objects as calling nlp(text) one article at a time but is faster.
df['Text_pos'] = [
    ' '.join(token.pos_ for token in doc)
    for doc in nlp.pipe(df['Text'])
]
# Cache the tagged frame so the tagging step need not be repeated.
df.to_pickle('newdata.pkl')

In [None]:
# Reload the POS-tagged frame from its cache and again drop every article
# whose stringified body is shorter than ten characters.
df = pd.read_pickle('newdata.pkl')
body_too_short = df['Body'].map(lambda art: len(str(art)) < 10).astype(bool)
df = df.loc[~body_too_short]

In [None]:
# Train/test split of the POS-tag sequences.
y = df.Label.astype('str')
# Fixed seed so the split, and therefore the reported accuracies, can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    df['Text_pos'], y, test_size=0.33, random_state=42)
x_train

In [None]:
# Tf-idf over bigrams of POS tags.
# No stop-word filter here: the input consists of tag names, not English
# words, and scikit-learn's English stop-word list contains "part", which
# would silently delete every PART tag before the bigrams are formed.
tfidf_vectorizer = TfidfVectorizer(ngram_range = (2,2))

# Learn the vocabulary and idf weights on the training sequences only.
tfidf_train = tfidf_vectorizer.fit_transform(x_train.astype('str'))

# Encode the test sequences with the vocabulary learned above.
tfidf_test = tfidf_vectorizer.transform(x_test.astype('str'))

# Persist both matrices; the context managers guarantee the files are
# flushed and closed.
with open("tfidf_train.pickle", "wb") as train_file:
    pickle.dump(tfidf_train, train_file)

with open("tfidf_test.pickle", "wb") as test_file:
    pickle.dump(tfidf_test, test_file)

In [None]:
# Last ten entries of the POS vectorizer's feature-name list (a positional
# slice of the vocabulary, not a ranking by weight).
tfidf_vectorizer.get_feature_names()[-10:]

In [None]:
# Display the POS-bigram tf-idf training matrix.
tfidf_train

In [None]:
# Multinomial Naive Bayes on the POS-bigram tf-idf features.
clf = MultinomialNB()
clf.fit(tfidf_train, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('pos_nb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with Multinomial Naive Bayes:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Random forest on the POS-bigram tf-idf features.
clf = RandomForestClassifier()
clf.fit(tfidf_train, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('pos_rf', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with RandomForestClassifier:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Gradient boosting on the POS-bigram tf-idf features.
clf = GradientBoostingClassifier()
clf.fit(tfidf_train, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('pos_gb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with Gradient Boosting:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Score every article against Empath's semantic categories (raw counts, not
# normalised) and keep one score vector per article.
lexicon = Empath()
# A space separates headline and body so the boundary words are not fused
# into one token.
df["Text"] = df["Headline"].map(str) + " " + df["Body"]

# Every row gets a vector, including empty texts: skipping rows would make
# `semantic` shorter than df, so the column assignment below would fail and
# the index-based lookups in the following cell would be misaligned.
semantic = [
    np.asarray(list(lexicon.analyze(article, normalize = False).values()))
    for article in df['Text']
]
df['Semantic'] = semantic

In [None]:
# Category names, in the order Empath reports them for any input.
categories = list(lexicon.analyze("").keys())
categories

In [None]:
# Turn each article's score vector into a pseudo-document in which every
# category name is repeated as many times as its (integer-truncated) score,
# so a text vectorizer can treat the score as a term frequency.
sem = [
    " ".join(
        categories[j]
        for j in range(len(semantic[0]))
        for _ in range(int(semantic[i][j]))
    )
    for i in range(df.shape[0])
]
df['Semantics'] = sem
# Cache the frame with all derived columns for the later sections.
df.to_pickle('Semantic.pkl')

In [None]:
# Reload the cached frame written by the previous cell and report its
# columns and dimensions.
df = pd.read_pickle('Semantic.pkl')
print(df.columns)
print(df.shape)

In [None]:
# Train/test split of the Empath pseudo-documents.
y = df.Label.astype('str')
# Fixed seed so the split, and therefore the reported accuracies, can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    df['Semantics'], y, test_size=0.33, random_state=42)
x_train

In [None]:
# Sanity check: type and number of rows of the training pseudo-documents.
print(type(x_train))
print(x_train.shape)

In [None]:
# Tf-idf over the Empath pseudo-documents (one "word" per category hit).
# No stop-word filter here: the tokens are category names, and scikit-learn's
# English stop-word list contains ordinary words such as "fire" that are also
# Empath categories and would be dropped silently.
tfidf2_vectorizer = TfidfVectorizer(ngram_range = (1,1))

# Learn the vocabulary and idf weights on the training documents only.
tfidf2_train = tfidf2_vectorizer.fit_transform(x_train.astype('str'))

# Encode the test documents with the vocabulary learned above.
tfidf2_test = tfidf2_vectorizer.transform(x_test.astype('str'))

# Persist both matrices; the context managers guarantee the files are
# flushed and closed.
with open("tfidf2_train.pickle", "wb") as train_file:
    pickle.dump(tfidf2_train, train_file)

with open("tfidf2_test.pickle", "wb") as test_file:
    pickle.dump(tfidf2_test, test_file)

In [None]:
# Multinomial Naive Bayes on the semantic tf-idf features.
# The classifier needs the numeric matrices (tfidf2_train / tfidf2_test);
# the raw pseudo-document strings in x_train / x_test cannot be fitted.
clf = MultinomialNB()
clf.fit(tfidf2_train, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('sem_nb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(tfidf2_test)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with Multinomial Naive Bayes:   %0.3f" % score)

In [None]:
# Random forest on the semantic tf-idf features.
# The classifier needs the numeric matrices (tfidf2_train / tfidf2_test);
# the raw pseudo-document strings in x_train / x_test cannot be fitted.
clf = RandomForestClassifier()
clf.fit(tfidf2_train, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('sem_rf', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(tfidf2_test)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with RandomForestClassifier:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Gradient boosting on the semantic tf-idf features.
# The classifier needs the numeric matrices (tfidf2_train / tfidf2_test);
# the raw pseudo-document strings in x_train / x_test cannot be fitted.
clf = GradientBoostingClassifier()
clf.fit(tfidf2_train, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('sem_gb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(tfidf2_test)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with Gradient Boosting:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
#Combining the 3 feature vectors
import scipy.sparse as sp

# The tf-idf matrices built in the three sections above each come from their
# own train_test_split call, so row i of one matrix is not guaranteed to be
# the same article as row i of another, and y_train belongs to only one of
# those splits. Stacking them side by side (and zero-padding any row-count
# difference) would pair features with the wrong articles and labels.
# Split the frame once here and derive every feature block from that single
# split so the rows of X, Y and the labels line up.
train_df, test_df, y_train, y_test = train_test_split(
    df, df.Label.astype('str'), test_size=0.33, random_state=42)

# (column, n-gram range, stop words) for each feature family, in the column
# order POS bigrams, word bigrams, semantic unigrams. The stop-word filter is
# applied only to real English text, not to tag or category names.
feature_specs = (
    ('Text_pos', (2, 2), None),
    ('Text', (2, 2), 'english'),
    ('Semantics', (1, 1), None),
)

train_blocks, test_blocks = [], []
for column, ngram_range, stop_words in feature_specs:
    vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    # Fit on the training rows only, then reuse the vocabulary for the test rows.
    train_blocks.append(vectorizer.fit_transform(train_df[column].astype('str')))
    test_blocks.append(vectorizer.transform(test_df[column].astype('str')))

# X: combined training features, Y: combined test features.
X = sp.hstack(train_blocks)
Y = sp.hstack(test_blocks)

In [None]:
# Multinomial Naive Bayes on the combined feature matrix.
clf = MultinomialNB()
clf.fit(X, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('pos_sem_nb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(Y)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with Multinomial Naive Bayes:   %0.3f" % score)

In [None]:
# Random forest on the combined feature matrix.
clf = RandomForestClassifier()
clf.fit(X, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('pos_sem_rf', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(Y)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with RandomForestClassifier:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Gradient boosting on the combined feature matrix.
clf = GradientBoostingClassifier()
clf.fit(X, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('pos_sem_gb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(Y)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with Gradient Boosting:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:


# Load the final DataFrame directly from the pickle file saved earlier, so
# the sections below can be run without repeating the preprocessing.
df = pd.read_pickle('Semantic.pkl')
print(df.columns)
print(df.shape)



In [None]:
# One split of the whole frame, so the bigram, POS and semantic features
# derived below all refer to the same train and test articles.
# Labels are cast to str like in the other splits of this notebook.
y = df.Label.astype('str')
# Fixed seed so the split, and therefore the reported accuracies, can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    df, y, test_size=0.33, random_state=42)

In [None]:
# Pull the three text representations out of the shared train/test frames.
x_train_text, x_test_text = x_train['Text'], x_test['Text']
x_train_text_pos, x_test_text_pos = x_train['Text_pos'], x_test['Text_pos']
x_train_semantics, x_test_semantics = x_train['Semantics'], x_test['Semantics']

In [None]:
#Tf-idf Bigrams
# Word-bigram tf-idf features, English stop words removed, vocabulary capped
# at 20000 features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range = (2,2), max_features = 20000)

# Learn the vocabulary and idf weights on the training texts only.
tfidf1_train = tfidf_vectorizer.fit_transform(x_train_text.astype('str'))

# Encode the test texts with the vocabulary learned above.
tfidf1_test = tfidf_vectorizer.transform(x_test_text.astype('str'))

# Persist both matrices; the context managers guarantee the files are
# flushed and closed.
with open("tfidf1_train.pickle", "wb") as train_file:
    pickle.dump(tfidf1_train, train_file)

with open("tfidf1_test.pickle", "wb") as test_file:
    pickle.dump(tfidf1_test, test_file)

In [None]:
#POS
# Tf-idf over bigrams of POS tags.
# No stop-word filter here: the input consists of tag names, not English
# words, and scikit-learn's English stop-word list contains "part", which
# would silently delete every PART tag before the bigrams are formed.
tfidf_vectorizer = TfidfVectorizer(ngram_range = (2,2))

# Learn the vocabulary and idf weights on the training sequences only.
tfidf_train = tfidf_vectorizer.fit_transform(x_train_text_pos.astype('str'))

# Encode the test sequences with the vocabulary learned above.
tfidf_test = tfidf_vectorizer.transform(x_test_text_pos.astype('str'))

# Persist both matrices; the context managers guarantee the files are
# flushed and closed.
with open("tfidf_train.pickle", "wb") as train_file:
    pickle.dump(tfidf_train, train_file)

with open("tfidf_test.pickle", "wb") as test_file:
    pickle.dump(tfidf_test, test_file)

In [None]:
# Tf-idf over the Empath pseudo-documents (one "word" per category hit).
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range = (1,1))

# Learn the vocabulary and idf weights on the training documents only.
tfidf2_train = tfidf_vectorizer.fit_transform(x_train_semantics.astype('str'))

# Encode the test documents with the vocabulary learned above.
tfidf2_test = tfidf_vectorizer.transform(x_test_semantics.astype('str'))

# Persist under the tfidf2_* names. Writing to "tfidf_train.pickle" /
# "tfidf_test.pickle" would overwrite the POS matrices saved under those
# names by the POS cell.
with open("tfidf2_train.pickle", "wb") as train_file:
    pickle.dump(tfidf2_train, train_file)

with open("tfidf2_test.pickle", "wb") as test_file:
    pickle.dump(tfidf2_test, test_file)

In [None]:
# Keep references to the unweighted matrices so the weighting cell below can
# be re-run without compounding the weights.
ttf1_train, ttf1_test = tfidf1_train, tfidf1_test
ttf_train, ttf_test = tfidf_train, tfidf_test
ttf2_train, ttf2_test = tfidf2_train, tfidf2_test

In [None]:
# Scale each feature family by its relative weight (bigrams 0.35, POS 0.5,
# semantics 0.15), multiplied by 3, the number of families.
big_w, synt_w, sem_w = 0.35 * 3, 0.5 * 3, 0.15 * 3
tfidf1_train, tfidf1_test = big_w*ttf1_train, big_w*ttf1_test
tfidf_train, tfidf_test = synt_w*ttf_train, synt_w*ttf_test
tfidf2_train, tfidf2_test = sem_w*ttf2_train, sem_w*ttf2_test

In [None]:
import scipy.sparse as sp

# Concatenate the three weighted feature families column-wise, in the order
# POS bigrams, word bigrams, semantic unigrams.
# All matrices derive from the same x_train / x_test frames, so their row
# counts already agree. Stacking them directly (rather than zero-padding a
# shorter matrix) means a row-count mismatch raises an error instead of
# silently producing rows that belong to no article.
X = sp.hstack((tfidf_train, tfidf1_train, tfidf2_train))   # training features
Y = sp.hstack((tfidf_test, tfidf1_test, tfidf2_test))      # test features

In [None]:
# Multinomial Naive Bayes on the weighted, combined feature matrix.
clf = MultinomialNB()
clf.fit(X, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('bi_pos_sem_nb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(Y)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with Multinomial Naive Bayes:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Random forest on the weighted, combined feature matrix.
clf = RandomForestClassifier()
clf.fit(X, y_train)
# Save the fitted model; the context manager closes the file handle.
with open('bi_pos_sem_rf', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(Y)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with RandomForestClassifier:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
# Gradient boosting on the weighted, combined feature matrix.
clf = GradientBoostingClassifier()
clf.fit(X, y_train)
# Saved as 'bi_pos_sem_gb', matching the 'bi_pos_sem_nb' / 'bi_pos_sem_rf'
# models of this section. The name 'pos_gb' is already used for the
# POS-only gradient-boosting model and would be overwritten.
with open('bi_pos_sem_gb', 'wb') as model_file:
    pickle.dump(clf, model_file)
pred = clf.predict(Y)
score = metrics.accuracy_score(y_test, pred)
print("Accuracy with Gradient Boosting:   %0.3f" % score)

In [None]:
# Confusion matrix of the latest predictions, rows/columns ordered FAKE, REAL.
class_labels = ['FAKE', 'REAL']
cm = metrics.confusion_matrix(y_test, pred, labels=class_labels)
plot_confusion_matrix(cm, classes=class_labels)

In [None]:
#For testing any new article
# Read the complete article text from a.txt into x_test; the context manager
# closes the file once it has been read.
with open('a.txt') as article_file:
    x_test = article_file.read()

In [None]:
#Tf-idf Bigrams
# Re-fit the word-bigram vectorizer on the training texts, then encode the
# single new article with that vocabulary.
tfidf_vectorizer = TfidfVectorizer(ngram_range = (2,2), stop_words='english', max_features = 20000)

tfidf1_train = tfidf_vectorizer.fit_transform(x_train_text.astype('str'))

# transform expects an iterable of documents, hence the one-element list.
tfidf1_test = tfidf_vectorizer.transform([x_test])

In [None]:
# Convert the new article into the space-separated sequence of its coarse
# part-of-speech tags (spaCy's token.pos_).
nlp = spacy.load('en_core_web_sm')
txt = ' '.join(token.pos_ for token in nlp(x_test))
txt

In [None]:
#Tf-idf POS bigrams
# Re-fit the POS-bigram vectorizer on the training tag sequences.
# No stop-word filter here: the input consists of tag names, not English
# words, and scikit-learn's English stop-word list contains "part", which
# would silently delete every PART tag.
tfidf_vectorizer = TfidfVectorizer(ngram_range = (2,2))

tfidf_train = tfidf_vectorizer.fit_transform(x_train_text_pos.astype('str'))

# Encode the article's POS-tag string (txt), not its raw text: the vocabulary
# consists of tag bigrams, so raw words would match essentially nothing.
tfidf_test = tfidf_vectorizer.transform([txt])

In [None]:
# Build the Empath pseudo-document for the new article: every category name
# is repeated as many times as its (integer-truncated) raw score.
# The lexicon is created first so this cell does not depend on a `lexicon`
# object left over from an earlier section.
lexicon = Empath()
categories = list(lexicon.analyze("").keys())

# Raw (un-normalised) category counts, as used for the training articles.
d = lexicon.analyze(x_test, normalize = False)
sem = list(d.values())

# Iterating over d.items() keeps each name paired with its own score; the
# join runs once, and b is defined even when no category scores above zero.
b = " ".join(
    category
    for category, score in d.items()
    for _ in range(int(score))
)
b

In [None]:
# Re-fit the semantic unigram vectorizer on the training pseudo-documents.
# No stop-word filter here: the tokens are Empath category names, and
# scikit-learn's English stop-word list contains ordinary words such as
# "fire" that are also categories and would be dropped silently.
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,1))

tfidf2_train = tfidf_vectorizer.fit_transform(x_train_semantics.astype('str'))

# Encode the new article's pseudo-document with the vocabulary learned above.
tfidf2_test = tfidf_vectorizer.transform([b])

In [None]:
import scipy.sparse as sp

# Concatenate the three feature families column-wise, in the order POS
# bigrams, word bigrams, semantic unigrams.
# The training matrices all derive from the same x_train frame and each test
# matrix holds the single new article, so the row counts already agree.
# Stacking them directly (rather than zero-padding a shorter matrix) means a
# row-count mismatch raises an error instead of going unnoticed.
X = sp.hstack((tfidf_train, tfidf1_train, tfidf2_train))   # training features
Y = sp.hstack((tfidf_test, tfidf1_test, tfidf2_test))      # new-article features

In [None]:
# Train Naive Bayes on the combined training features and show the predicted
# label for the new article.
clf = MultinomialNB().fit(X, y_train)
clf.predict(Y)