In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.tokenize.toktok import ToktokTokenizer
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayusharora/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ayusharora/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayusharora/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ayusharora/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [7]:
stop_words = set(stopwords.words('english'))
stop_words.remove('no')
stop_words.remove('not')

In [8]:
def remove_stop_words(text):
    word_tokens = word_tokenize(text) 
    filtered = [token for token in word_tokens if token.lower() not in stop_words]
    filtered = ' '.join(filtered)
    return filtered

def remove_special_characters(text,remove_digits = False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern,'',text)
    return text

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word)for word in text.split()])
    return text
def lemmatize(text):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word)for word in text.split()])
    return text


In [9]:
for i, row in train.iterrows():
    new_text = remove_stop_words(train.at[i,'text'])
    new_text = remove_special_characters(train.at[i,'text'], remove_digits = True)
    new_text = lemmatize(new_text)
    new_text = simple_stemmer(new_text)
    train.at[i,'text'] = new_text

In [10]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['text'], train['target'])

In [11]:
tf_idf_vectorizer = TfidfVectorizer(use_idf=True)
tf_idf_vectorizer.fit(train['text'])
train_vectors = tf_idf_vectorizer.transform(train_x)
valid_vectors = tf_idf_vectorizer.transform(valid_x)

In [12]:
all_train = tf_idf_vectorizer.transform(train['text'])
all_train_y = train['target']
test_vectors = tf_idf_vectorizer.transform(test['text'])

In [13]:
all_train_y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [14]:
def train_model(classifier,train_vectors,target,valid_vectors):
    classifier.fit(train_vectors,target)
    predictions = classifier.predict(valid_vectors)
    return metrics.accuracy_score(predictions,valid_y)

In [15]:
nv_accuracy = train_model(naive_bayes.MultinomialNB(),train_vectors,train_y,valid_vectors)
nv_accuracy

0.7951680672268907

In [16]:
log_reg_accuracy = train_model(linear_model.LogisticRegression(),train_vectors,train_y,valid_vectors)
log_reg_accuracy



0.7925420168067226

In [17]:
svm_boost_accuracy = train_model(svm.SVC(),train_vectors,train_y,valid_vectors)
svm_boost_accuracy



0.5698529411764706

In [18]:
def test_model(classifier,all_train,target,test_vectors):
    classifier.fit(all_train,target)
    predictions = classifier.predict(test_vectors)
    return predictions

In [28]:
predictions = test_model(svm.SVC(),all_train,all_train_y,test_vectors)



In [29]:
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [30]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['target'] = predictions

In [31]:
sample_submission.to_csv('submisssion.csv',index = False)