In [None]:
import ast
import codecs
import os

import pandas as pd
import spacy
from gensim import matutils,corpora, models
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# We have positive reviews and negative reviews in separate directories

In [None]:
# Collect the file names found in each review directory.
positive_list = os.listdir("positive_reviews")  # every entry of the positive_reviews directory
negative_list = os.listdir("negative_reviews")  # every entry of the negative_reviews directory

In [None]:
import re
def preprocess(files_list,root_dir,polarity):
    """Load review files into a DataFrame labelled with polarity and class.

    Parameters
    ----------
    files_list : iterable of str
        File names inside ``root_dir``. A leading ``._`` is stripped from
        each name before the file is opened.
    root_dir : str
        Directory that holds the review files.
    polarity : str
        Value stored in the ``labeled_class`` column for every row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``files_list`` with the columns
        ``labeled_class``, ``review`` (the full text of the file) and
        ``actual_class`` (the part of the file name before the first ``_``).
    """
    labeled_class = []
    reviews = []
    actual_class = []
    for file_name in files_list:
        # Raw string: "\." and "\_" are not valid escapes in a plain literal.
        clean_name = re.sub(r"^\._", "", file_name)
        # The context manager closes the handle as soon as the text is read.
        with open(root_dir + '/' + clean_name) as handle:
            reviews.append(handle.read())
        labeled_class.append(polarity)
        actual_class.append(clean_name.split('_')[0])
    return pd.DataFrame({'labeled_class':labeled_class,'review':reviews,'actual_class':actual_class})

In [None]:
# Build one labelled DataFrame per polarity directory.
positive_df = preprocess(positive_list, 'positive_reviews', 'positive')
negative_df = preprocess(negative_list, 'negative_reviews', 'negative')

In [None]:
#negative_df.head()

In [None]:
#positive_df.head()

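Positive feedback comes in two kinds: true reviews and fake reviews (encoded below as targets 2 and 1).

Negative feedback comes in two kinds: true reviews and fake reviews (encoded below as targets 3 and 4).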

In [None]:
# Encode the positive reviews by file-name prefix: 't' (true review) -> 2,
# 'd' (fake review) -> 1.
positive_codes = {'t': 2, 'd': 1}
unexpected_rows = positive_df[
    (positive_df['labeled_class'] != 'positive')
    | ~positive_df['actual_class'].isin(list(positive_codes))
]
if not unexpected_rows.empty:
    # Fail with the offending values instead of printing 'Error!' and then
    # crashing on a length mismatch when the column is assigned.
    raise ValueError(
        'Unexpected labeled_class/actual_class in positive_df: {}'.format(
            unexpected_rows[['labeled_class', 'actual_class']].drop_duplicates().values.tolist()
        )
    )
target = positive_df['actual_class'].map(positive_codes).tolist()
positive_df['target'] = target

In [None]:
# Encode the negative reviews by file-name prefix: 't' (true review) -> 3,
# 'd' (fake review) -> 4.
negative_codes = {'t': 3, 'd': 4}
unexpected_rows = negative_df[
    (negative_df['labeled_class'] != 'negative')
    | ~negative_df['actual_class'].isin(list(negative_codes))
]
if not unexpected_rows.empty:
    # Fail with the offending values instead of printing 'Error!' and then
    # crashing on a length mismatch when the column is assigned.
    raise ValueError(
        'Unexpected labeled_class/actual_class in negative_df: {}'.format(
            unexpected_rows[['labeled_class', 'actual_class']].drop_duplicates().values.tolist()
        )
    )
target = negative_df['actual_class'].map(negative_codes).tolist()
negative_df['target'] = target

In [None]:
# Combine both polarities into one frame via an outer merge; with no `on`
# argument pandas joins on every column the two frames share.
data = pd.merge(positive_df, negative_df, how='outer')

In [None]:
# Only the review text and its numeric class are needed from here on.
kept_columns = ['review', 'target']
data = data[kept_columns]

In [None]:
# Preview the first five rows of the combined frame.
data.head(n=5)

In [None]:
# Number of reviews in each of the target classes.
data['target'].value_counts()

In [None]:
from spacy.util import get_data_path

# Load the spaCy pipeline used for tokenising, lemmatising and POS tagging.
spacy_model_name = 'en_core_web_md'
nlp = spacy.load(spacy_model_name)

In [None]:
#pos_tags=[];
#g=[[]];
#for datapoint in data['review_tokenized']:
 #   s=""
  #  for j in datapoint:
   #     s=s+str(j)+"_"+j.pos_;
    #    s=s+","
    #t=s;    
    #pos_tags.append(t)  
    #g.append(pos_tags)
#print(g) 


In [None]:
# Run every review through the spaCy pipeline and collect, per document,
# the token texts, the lemmas and the coarse part-of-speech tags.
tokens = []
lemma = []
pos = []
review_texts = data['review'].astype(str).values
# `n_threads` is intentionally not passed: spaCy deprecated it in v2.1 and
# Language.pipe no longer accepts it in v3.
for doc in nlp.pipe(review_texts, batch_size=50):
    # spaCy v3 replaces the deprecated Doc.is_parsed flag with
    # Doc.has_annotation("DEP"); fall back to the flag on older releases.
    if hasattr(doc, 'has_annotation'):
        parsed = doc.has_annotation('DEP')
    else:
        parsed = doc.is_parsed
    if parsed:
        # Token texts are stored as the string form of a list; the cell that
        # builds the token_POS features decodes that string again.
        tokens.append(str([n.text for n in doc]))
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # Keep one entry per review so the lists stay the same length as the
        # DataFrame even when a document could not be parsed.
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

data['species_tokens'] = tokens
data['species_lemma'] = lemma
data['species_pos'] = pos

In [None]:
#data['species_tokens']

In [None]:
#data['species_pos'] = data['species_pos'].astype(str).apply(lambda x: ', '.join(data['species_pos'].astype(str)))

In [None]:
#data

In [None]:
# Build the "<token>_<POS>" features for every review.
col = []
for stored_tokens, pos_tags in zip(data['species_tokens'], data['species_pos']):
    if stored_tokens is None or pos_tags is None:
        # Reviews that could not be parsed are stored as None; give them an
        # empty feature list so `col` keeps one entry per row of `data`.
        col.append([])
        continue
    # The tokens are stored as the string form of a list. literal_eval only
    # decodes Python literals, whereas eval would execute any expression.
    if isinstance(stored_tokens, list):
        words = stored_tokens
    else:
        words = ast.literal_eval(stored_tokens)
    col.append([word + '_' + tag for word, tag in zip(words, pos_tags)])

In [None]:
#col[:10]

In [None]:
# Attach the token_POS lists as a new column. pd.Series(col) carries a default
# 0..n-1 index and pandas aligns on index labels during the assignment, so this
# assumes `data` still has that same default index — TODO confirm.
data['review_tokenized'] = pd.Series(col)

In [None]:
# Preview the first five rows, now including the spaCy-derived columns.
data.head(n=5)

In [None]:
from gensim import matutils,corpora, models

def vectorize_comments(df):
    """Turn the tokenised reviews into a bag-of-words matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review_tokenized`` column holding one list of
        tokens per row.

    Returns
    -------
    tuple
        ``(dictionary, matrix)``: the gensim dictionary after
        ``filter_extremes(no_below=3)`` and the transposed result of
        ``matutils.corpus2csc`` built from the bag-of-words of every row.
    """
    documents = df["review_tokenized"]

    # Build the vocabulary, then prune it with no_below=3 and close id gaps.
    vocabulary = corpora.Dictionary(documents)
    vocabulary.filter_extremes(no_below=3)
    vocabulary.compactify()

    bags_of_words = [vocabulary.doc2bow(token_list) for token_list in documents]
    sparse_counts = matutils.corpus2csc(bags_of_words, num_terms=len(vocabulary.token2id))
    return vocabulary, sparse_counts.transpose()

# Vectorise the whole data set and report the dimensions of the matrix.
dictionary, corpus = vectorize_comments(df=data)
print(corpus.shape)

In [None]:
def train_svm(X,y):
    """Fit an SVC through a 10-fold grid search and return the fitted search.

    The grid holds a single candidate (``C=1000``, ``random_state=42``).

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The search object after ``fit(X, y)`` has been called on it.
    """
    search = GridSearchCV(
        SVC(),
        param_grid={'C': [1000], 'random_state': [42]},
        cv=10,
    )
    search.fit(X, y)
    return search

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(corpus, data["target"], test_size=0.3, random_state=2016)
# train_svm already fits the grid search on the training split, so no second
# fit is needed. (`svc_clf.fit = (X_train, y_train)` would be an assignment
# that overwrites the estimator's fit method with a tuple.)
svc_clf = train_svm(X_train,y_train)
print("Accuracy of SVM on test sets is : {}".format(svc_clf.score(X_test,y_test)))

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
y_pred = svc_clf.predict(X_test)
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual classes and columns the predicted ones.
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import matthews_corrcoef
# Matthews correlation coefficient of the held-out predictions.
matthews_corrcoef(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the fitted grid search over the full data set.
scores = cross_val_score(estimator=svc_clf, X=corpus, y=data["target"], cv=10)

In [None]:
# Per-fold scores, then their mean with a +/- two-standard-deviation band.
print(scores)
mean_accuracy = scores.mean()
two_std = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, two_std))

In [None]:
def model_test(review):
    """Predict the class of one vectorised review and return a readable label.

    Parameters
    ----------
    review : a single-row feature matrix accepted by ``svc_clf.predict``.

    Returns
    -------
    str
        ``'Fake Review (Positive)'`` for class 1, ``'True Review (Positive)'``
        for class 2, ``'True Review (Negative)'`` for class 3 and
        ``'Fake Review (Negative)'`` for class 4.

    Raises
    ------
    ValueError
        If the classifier does not return exactly one prediction, or returns
        a class outside 1-4.
    """
    labels = {
        1: 'Fake Review (Positive)',
        2: 'True Review (Positive)',
        3: 'True Review (Negative)',
        4: 'Fake Review (Negative)',
    }
    # predict() returns an array; pull out the single value instead of
    # truth-testing the array itself, which only works for one element.
    predictions = list(svc_clf.predict(review))
    if len(predictions) != 1:
        raise ValueError(
            'model_test expects exactly one review, got {} predictions'.format(len(predictions))
        )
    predicted = predictions[0]
    if predicted not in labels:
        # An unknown class must not be silently reported as a fake negative.
        raise ValueError('Unexpected predicted class: {!r}'.format(predicted))
    return labels[predicted]

In [None]:
# Print the predicted label for every held-out review, one row at a time.
for test_row in X_test:
    predicted_label = model_test(test_row)
    print(predicted_label)