In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')
import random
from nltk.corpus import stopwords
from sklearn import svm

## Step 1. Data Collection

In [2]:
# Reading dataset into dta
dta = pd.read_csv('cbdata1.csv', encoding = "ISO-8859-1")
dta.shape

(4050, 3)

## Step 2. Features Manipulation

In [3]:
def starts_num(input):
    m=re.match(r'^[0-9]',input)
    if m is not None:
        return 1
    else:
        return 0
dta['Starts_with_number'] = dta['title'].apply(lambda x: starts_num(x)) 

In [4]:
def spl_char(input):  
    input="".join(input.split())
    m=re.findall(r'[^a-zA-z0-9]',input)
    if m is not None:
        return len(re.findall(r"[^a-zA-z0-9,.']",input))
    else:
        return 0
dta['Number_of_special_characters'] = dta['title'].apply(lambda x: spl_char(x)) 

In [5]:
def title_len(input):  
    return len(input)
dta['Length_of_headline'] = dta['title'].apply(lambda x: title_len(x)) 

In [6]:
def number_words(input):  
    return len(input.split())
dta['Number_of_words'] = dta['title'].apply(lambda x: number_words(x))

In [7]:
def avg_len(input):  
    return len(input)/len(input.split())
dta['Avg_len_of_words'] = dta['title'].apply(lambda x: avg_len(x)) 

In [8]:
def stop_word(input):
    stop_words = set(stopwords.words('english'))
    count=0
    for i in input.lower().split():
        if i in stop_words:
            count=count+1
    return count
dta['stop_words'] = dta['title'].apply(lambda x: stop_word(x)) 

## Test Train split

In [9]:
train, test = train_test_split(dta, test_size = 0.2)

train.to_csv('new_train_data.csv', encoding = "ISO-8859-1")
test.to_csv('new_test_data.csv', encoding = "ISO-8859-1")

In [10]:
dta.isnull().values.sum()

0

In [11]:
train_df1=pd.read_csv('new_train_data.csv', encoding = "ISO-8859-1")
#train_df1.head()

In [12]:
test_df1=pd.read_csv('new_test_data.csv', encoding = "ISO-8859-1")
test_df1.head()

Unnamed: 0.1,Unnamed: 0,title,Number_of_tags,class_val,Starts_with_number,Number_of_special_characters,Length_of_headline,Number_of_words,Avg_len_of_words,stop_words
0,3656,E.W. Scripps Reports a Loss and Will Cut Salaries,3,0,0,0,49,9,5.444444,3
1,2856,Putinâs Grasp of Energy Drives Russian Agenda,5,0,0,3,47,7,6.714286,1
2,2852,Free Internet-Calling Services for Cellphones,7,0,0,1,45,5,9.0,1
3,612,21 Precious Moments Every Close Mother And Dau...,2,1,1,0,57,9,6.333333,1
4,281,This Was What The World Looked Like 30 Years Ago,10,1,0,0,48,10,4.8,4


## Pre-processing data

### Removing the Unnamed column from the new training and test set

In [13]:
train_df1.drop(train_df1.columns[0],axis=1, inplace=True)

In [14]:
test_df1.drop(test_df1.columns[0],axis=1, inplace=True)

In [15]:
train_features_df1=pd.DataFrame(np.array(train_df1[['Number_of_tags']]) , columns=['Number_of_tags'] )
train_features_df2=train_df1.ix[:,3:]
train_features=pd.concat([train_features_df1,train_features_df2],axis =1)

In [16]:
test_features_df1=pd.DataFrame(np.array(test_df1[['Number_of_tags']]) , columns=['Number_of_tags'] )
test_features_df2=test_df1.ix[:,3:]
test_features=pd.concat([test_features_df1,test_features_df2],axis =1)

In [17]:
#test_features.head()

### Creating Response Vector

In [18]:
#train_df1

In [19]:
response_vector=pd.DataFrame(np.array(train_df1['class_val']),columns=['class_val'])
response_vector.head(5)

Unnamed: 0,class_val
0,0
1,0
2,0
3,1
4,1


In [20]:
train_features.head()

Unnamed: 0,Number_of_tags,Starts_with_number,Number_of_special_characters,Length_of_headline,Number_of_words,Avg_len_of_words,stop_words
0,3,0,0,43,8,5.375,2
1,4,0,2,47,7,6.714286,1
2,5,0,7,65,12,5.416667,6
3,6,0,1,83,12,6.916667,4
4,3,0,0,81,17,4.764706,10


### Creating response vector and feature set

In [21]:
test_df1.isnull().values.sum()

0

In [22]:
train_df1.columns

Index(['title', 'Number_of_tags', 'class_val', 'Starts_with_number',
       'Number_of_special_characters', 'Length_of_headline', 'Number_of_words',
       'Avg_len_of_words', 'stop_words'],
      dtype='object')

## SVM classifier

In [23]:
 clf = svm.SVC(kernel='rbf')

In [24]:
response_vector=pd.DataFrame(np.array(train_df1['class_val']),columns=['class_val'])

In [25]:
clf=clf.fit(train_features,response_vector.values.ravel())

In [26]:
classfier_score=clf.score(train_features,response_vector.values.ravel())
classfier_score

0.88703703703703707

In [27]:
predict_cb=clf.predict(test_features)

In [28]:
cb = pd.DataFrame(columns=['title', 'class_val'])
cb['title'] = test_df1.title
cb['class_val'] = predict_cb.astype(int)
predicted_output=pd.concat([cb],axis=1)
predicted_output.to_csv('svm.csv',index=False)

In [29]:
test_cb=pd.DataFrame(columns=['title', 'class_val'])
test_cb['title'] = test_df1.title
test_cb['class_val']= test_df1.class_val
expected_output=pd.concat([test_cb], axis=1)
expected_output.to_csv('expected.csv', index=False)

In [30]:
lr=pd.read_csv('svm.csv', encoding = "ISO-8859-1")

In [31]:
exp=pd.read_csv('expected.csv', encoding = "ISO-8859-1")

In [32]:
print (classification_report(exp.class_val, lr.class_val))

             precision    recall  f1-score   support

          0       0.87      0.84      0.85       435
          1       0.82      0.86      0.84       375

avg / total       0.85      0.85      0.85       810



In [33]:
expected=test_df1.class_val

In [34]:
print (accuracy_score(exp.class_val, lr.class_val))

0.845679012346


## Randomization

In [35]:
n=random.randint(3,14)
if n%2 == 0:
    n=n-1
print(n)

7


In [36]:
sampled_df = {}
train_df = {}
response_vector = {}


In [37]:
for i in range(0,n):
    rows = np.random.choice(train_features.index.values, 100)
    sampled_df[i] = train_features.ix[rows]
    train_df[i] =train_df1.ix[rows]
    response_vector[i]=pd.DataFrame(np.array(train_df[i]['class_val']),columns=['class_val'])
    #response_vector[i]=train_df[i].class_val
 

In [38]:
response_vector[i].shape

(3000, 1)

In [39]:
cb = {}
predicted_output = {}

In [40]:
 clf = svm.SVC(kernel='rbf')

In [41]:
for i in range(0,n):
     clf = svm.SVC(kernel='rbf')
   #  clf=LogisticRegression()
     clf=clf.fit(sampled_df[i],response_vector[i].values.ravel())
     classfier_score=clf.score(sampled_df[i],response_vector[i].values.ravel())
     predict_cb=clf.predict(test_features)
     ctr=clf.predict(test_features).sum()
     cb[i] = pd.DataFrame(columns=['title', 'class_val'])
     cb[i]['title'] = test_df1.title
     cb[i]['class_val'] = predict_cb.astype(int)
     predicted_output[i]=pd.concat([cb[i]],axis=1)
     predicted_output[i].to_csv('svm[{}].csv'.format(i),index=False)
     #predicted_output[i].to_csv('svm[' + str(i) + ' ].csv'.format(i),index=False)

In [42]:
test_cb=pd.DataFrame(columns=['title', 'class_val'])
test_cb['title'] = test_df1.title
test_cb['class_val']= test_df1.class_val
expected_output=pd.concat([test_cb], axis=1)
expected_output.to_csv('cboutput.csv', index=False)

In [43]:
cb_final = pd.DataFrame(columns=['title', 'class_val'])
cb_final['title'] = test_cb.title

In [44]:
cb[0].isnull().values.sum()

0

In [45]:
cb[1].iloc[5].class_val

0

In [46]:
print(n)

7


In [47]:
a=n/2
print(a)

3.5


In [48]:
for i in range(0, len(cb[0])):
    count=0
    for j in range(0,n):
        if cb[j].iloc[i].class_val == 1:
            count=count+1
    if count>a:
        cb_final.iloc[i, cb_final.columns.get_loc('class_val')]=1
        #print(1)
    else:
        cb_final.iloc[i, cb_final.columns.get_loc('class_val')]=0
        #print(0)

In [49]:
cb_final.head()

Unnamed: 0,title,class_val
0,E.W. Scripps Reports a Loss and Will Cut Salaries,0
1,Putinâs Grasp of Energy Drives Russian Agenda,0
2,Free Internet-Calling Services for Cellphones,0
3,21 Precious Moments Every Close Mother And Dau...,0
4,This Was What The World Looked Like 30 Years Ago,1


In [50]:
#cb_final
type(cb_final.class_val)

pandas.core.series.Series

In [51]:
type(test_cb.class_val)

pandas.core.series.Series

In [52]:
cb_final.class_val.shape

(810,)

In [53]:
cb_final.to_csv('final.csv',index=False)

In [54]:
cf=pd.read_csv('final.csv', encoding = "ISO-8859-1")

In [55]:
test=pd.read_csv('cboutput.csv', encoding = "ISO-8859-1")

In [56]:
expected=test.class_val

In [57]:
print (classification_report(expected, cf.class_val))

             precision    recall  f1-score   support

          0       0.86      0.83      0.85       435
          1       0.81      0.84      0.83       375

avg / total       0.84      0.84      0.84       810



In [58]:
print (f1_score(expected, cf.class_val))

0.828309305374


In [59]:
print (accuracy_score(expected, cf.class_val))

0.838271604938


In [60]:
print (recall_score(expected, cf.class_val))

0.842666666667
