In [1]:
import pandas as pd
import nltk

In [2]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [3]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
print(train.shape, test.shape)

(20800, 5) (5200, 4)


In [5]:
train.fillna('', inplace=True)

In [6]:
train["total"] = train["title"] + " " + train["author"] + train["text"]

In [7]:
X_train = train["total"][:]
y_train = train["label"][:]

## Cleaning the data

In [8]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [9]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/illusion/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
for index, row in X_train.iteritems():
    row = re.sub(r'[^\w\s]', '', row)
    row = word_tokenize(row)
    row = [word for word in row if word not in stop_words]
    sentence = ""
    for word in row:
        sentence = sentence + " " + str(lemmatizer.lemmatize(word)).lower()
    X_train.loc[index] = sentence

In [11]:
print(X_train[:10])

0     house dem aide we didnt even see comeys lette...
1     flynn hillary clinton big woman campus breitb...
2     why truth might get you fired consortiumnewsc...
3     15 civilians killed in single us airstrike ha...
4     iranian woman jailed fictional unpublished st...
5     jackie mason hollywood would love trump he bo...
6     life life of luxury elton johns 6 favorite sh...
7     benoît hamon wins french socialist partys pre...
8     excerpts from draft script donald trumps qamp...
9     a backchannel plan ukraine russia courtesy tr...
Name: total, dtype: object


In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# c_vectorizer = CountVectorizer()
tf_vectorizer = TfidfVectorizer()
# X_train_c_vector = c_vectorizer.fit_transform(X_train)
X_train_tf_vector = tf_vectorizer.fit_transform(X_train)

In [13]:
# print(X_train_c_vector.shape)
print(X_train_tf_vector.shape)

(20800, 220387)


In [14]:
from sklearn.svm import SVC
c_svc = SVC()
# c_svc.fit(X_train_vector, y_train_1)

In [15]:
from sklearn import model_selection

In [16]:
# c_svc_cv_cvec = model_selection.cross_val_score(c_svc, X_train_c_vector, y_train, scoring="accuracy", cv=3, n_jobs=-1)
c_svc_cv_tfid = model_selection.cross_val_score(c_svc, X_train_tf_vector, y_train, scoring="accuracy", cv=3, n_jobs=-1)

In [17]:
# print(c_svc_cv_cvec.mean())
print(c_svc_cv_tfid.mean())

0.9702402989604804


In [18]:
from sklearn.pipeline import Pipeline
import joblib

In [19]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC())
])

In [20]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', SVC())])

In [21]:
pipeline.predict(["flynn hillary clinton big woman campus breitbart daniel j flynnever get feeling life circle roundabout rather head straight line toward intended destination hillary clinton remains big woman campus leafy liberal wellesley massachusetts everywhere else vote likely inauguration dress remainder day way miss havisham forever wore wedding dress speaking great expectations hillary rodham overflowed 48 year ago first addressed wellesley graduating class the president college informed gathered 1969 student needed debate far i could ascertain spokesman kind like democratic primary 2016 minus term unknown even seven sisters school i glad miss adams made clear i speaking today u 400 u miss rodham told classmate after appointing edger bergen charlie mccarthys mortimer snerds attendance bespectacled granny glass awarding matronly wisdom least john lennon wisdom took issue previous speaker despite becoming first win election seat u s senate since reconstruction edward brooke came criticism calling empathy goal protestors criticized tactic though clinton senior thesis saul alinsky lamented black power demagogue elitist arrogance repressive intolerance within new left similar word coming republican necessitated brief rebuttal trust rodham ironically observed 1969 one word i asked class rehearsal wanted say everyone came said talk trust talk lack trust u way feel others talk trust bust what say what say feeling permeates generation perhaps even understood distrusted the trust bust certainly busted clintons 2016 plan she certainly even understand people distrusted after whitewater travelgate vast conspiracy benghazi missing email clinton found distrusted voice friday there load compromising road broadening political horizon and distrust american people trump edged 48 percent 38 percent question immediately prior novembers election stood major reason closing horizon clinton described vanquisher supporter embracing lie con alternative fact assault truth reason she failed explain american people chose lie truth as history major among today know well people power invent fact attack question mark beginning end free society offered that hyperbole like many people emerge 1960s hillary clinton embarked upon long strange trip from high school goldwater girl wellesley college republican president democratic politician clinton drank time place gave degree more significantly went idealist cynic comparison two wellesley commencement address show way back lamented long leader viewed politics art possible challenge practice politics art making appears impossible possible now big woman campus odd woman white house wonder current station even possible why arent i 50 point ahead asked september in may asks isnt president the woman famously dubbed congenital liar bill safire concludes lie mind getting stood election day like finding jilted bride wedding day inspires dangerous delusion"])

array([0])

In [22]:
filename = "pipeline.sav"
joblib.dump(pipeline, filename)

['pipeline.sav']

## Load Pipeline

In [23]:
loaded_model = joblib.load(filename)
result = loaded_model.predict(["flynn hillary clinton big woman campus breitbart daniel j flynnever get feeling life circle roundabout rather head straight line toward intended destination hillary clinton remains big woman campus leafy liberal wellesley massachusetts everywhere else vote likely inauguration dress remainder day way miss havisham forever wore wedding dress speaking great expectations hillary rodham overflowed 48 year ago first addressed wellesley graduating class the president college informed gathered 1969 student needed debate far i could ascertain spokesman kind like democratic primary 2016 minus term unknown even seven sisters school i glad miss adams made clear i speaking today u 400 u miss rodham told classmate after appointing edger bergen charlie mccarthys mortimer snerds attendance bespectacled granny glass awarding matronly wisdom least john lennon wisdom took issue previous speaker despite becoming first win election seat u s senate since reconstruction edward brooke came criticism calling empathy goal protestors criticized tactic though clinton senior thesis saul alinsky lamented black power demagogue elitist arrogance repressive intolerance within new left similar word coming republican necessitated brief rebuttal trust rodham ironically observed 1969 one word i asked class rehearsal wanted say everyone came said talk trust talk lack trust u way feel others talk trust bust what say what say feeling permeates generation perhaps even understood distrusted the trust bust certainly busted clintons 2016 plan she certainly even understand people distrusted after whitewater travelgate vast conspiracy benghazi missing email clinton found distrusted voice friday there load compromising road broadening political horizon and distrust american people trump edged 48 percent 38 percent question immediately prior novembers election stood major reason closing horizon clinton described vanquisher supporter embracing lie con alternative fact assault truth reason she failed explain american people chose lie truth as history major among today know well people power invent fact attack question mark beginning end free society offered that hyperbole like many people emerge 1960s hillary clinton embarked upon long strange trip from high school goldwater girl wellesley college republican president democratic politician clinton drank time place gave degree more significantly went idealist cynic comparison two wellesley commencement address show way back lamented long leader viewed politics art possible challenge practice politics art making appears impossible possible now big woman campus odd woman white house wonder current station even possible why arent i 50 point ahead asked september in may asks isnt president the woman famously dubbed congenital liar bill safire concludes lie mind getting stood election day like finding jilted bride wedding day inspires dangerous delusion"])
result

array([0])