In [1]:
import numpy as np
import scipy
import sklearn
import spacy
import nltk
import pandas as pd

In [213]:
pd.set_option('display.max_colwidth', -1)

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC

In [3]:
nlp = spacy.load('en')

In [4]:
with open('anti_statements.txt', 'r') as f:
    anti = f.readlines()

In [5]:
with open('pro_statements.txt', 'r') as f:
    pro = f.readlines()

In [6]:
len(anti), len(pro)

(84, 769)

In [7]:
def preprocess(sent):
    sent = sent.lower()
    sent = " ".join([token.text for token in nlp(sent)])
    return sent

In [18]:
val = {'pro':0,'anti':1}
label = {0:'pro', 1:'anti'}

In [27]:
df_anti = pd.DataFrame({'statement':[preprocess(sent) for sent in anti], 'labels':list(np.ones(len(anti)))})
df_pro = pd.DataFrame({'statement':[preprocess(sent) for sent in pro], 'labels':list(np.zeros(len(pro)))})

In [202]:
df_pro.sample(frac=1).reset_index(drop=True)
df = df_anti.append(df_pro.head(len(anti)), ignore_index=True)
df.sample(frac=1).reset_index(drop=True)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33)
for train_index, test_index in sss.split(df['statement'], df['labels']):
    X_train_, X_test_ = df['statement'][train_index], df['statement'][test_index]
    y_train_, y_test_ =  df['labels'][train_index], df['labels'][test_index]
    X_train, X_test, y_train, y_test = list(X_train_), list(X_test_), list(y_train_), list(y_test_)

In [203]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [204]:
len(vectorizer.get_feature_names())

960

In [205]:
# model = LogisticRegression().fit(X_train, y_train)
model = LinearSVC(loss='squared_hinge', tol=0.000001, C=20.0).fit(X_train, y_train)



In [206]:
model.score(X_train, y_train), model.score(X_test, y_test)

(1.0, 0.7678571428571429)

In [207]:
f1_score(y_true=y_test, y_pred=model.predict(X_test)), precision_score(y_true=y_test, y_pred=model.predict(X_test)), recall_score(y_true=y_test, y_pred=model.predict(X_test))

(0.7450980392156864, 0.8260869565217391, 0.6785714285714286)

In [210]:
preds = [label[p] for p in model.predict(X_test)]

In [211]:
df_predictions = pd.DataFrame({'statement': X_test_, 'actual':[label[p] for p in y_test] ,'predicted': preds})

In [216]:
df_predictions

Unnamed: 0,statement,actual,predicted
7,yoga strengthens humans from within so that the mankind is not left behind amid the fast pace of technological development \n,anti,anti
36,the future lay in technology - driven agricultural growth . \n,anti,pro
104,"bjp leaders and workers should use whatever latest technology is available to them , be it social media or otherwise , to tell people the true state of things . \n",pro,anti
34,"but we did not use science and technology in our development processes as much as we should have , at the inception ceremony of the indian science congresss centenary session , to be celebrated in january next year . \n",anti,pro
76,technology is so advanced that video footage can be edited to suit one 's convenience . \n,anti,pro
149,defence cooperation with france has been enhanced and the two countries would work together in manufacturing defence equipment and developing technology \n,pro,pro
123,"with the help of technology , bjp should get connected with youth \n",pro,pro
166,"digitisation was so central to his vision of governance and financial inclusion , advances in cyberspace and faster connectivity were making the world a flatter place , allowing less developed nations to emerge on a par with the more developed ones \n",pro,anti
151,"india now hopes to build a strong and resilient relationship with israel , and gain from israeli cutting - edge technology and defence \n",pro,pro
150,quality of argument and judgement will improve with technology being used actively \n,pro,pro
