# Training your own classifier 

## Preprocessing

In [1]:
import re
import pandas as pd
import nltk

In [2]:
# Load the training set; the preview further down shows it carries textID, text and
# sentiment columns (plus an unnamed leading column).
data = pd.read_csv('train.csv')

In [3]:
# Preview the first row to check which columns were loaded.
data.head(1)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral


In [4]:
# Discard rows whose tweet text is missing: the cleaning cells below call
# string functions on every value and would fail on NaN.
rows_without_text = data.index[data['text'].isna()]
data.drop(index=rows_without_text, inplace=True)

## Remove punctuation

Punctuation doesn't provide any useful information about the sentiment of a piece of text, so it is removed to simplify the training process.

In [5]:
# Delete every character that is neither a word character nor whitespace.
# Note: this runs before the stopword and link steps, so contractions and URLs
# reach those steps with their apostrophes, dots and slashes already removed.
non_word_chars = re.compile(r'[^\w\s]')
data['text'] = data['text'].apply(lambda tweet: non_word_chars.sub('', tweet))

## Remove stopwords

Stopwords are words in the English language that aren't very meaningful and can be safely skipped when conducting sentiment analysis. Removing them makes the process faster.

In [6]:
# Needs the NLTK 'stopwords' corpus and the tokenizer data used by
# nltk.word_tokenize to be installed (see nltk.download).
# Imported under an alias so the variable below does not rebind the corpus module's name.
from nltk.corpus import stopwords as nltk_stopwords

# Keep the name `stopwords`: the test-set cleaning cell further down reuses it.
# A frozenset gives constant-time membership checks instead of scanning a list
# once per token.
stopwords = frozenset(nltk_stopwords.words('english'))

# Tokenise each tweet, drop tokens whose lower-cased form is a stopword, and
# re-join the survivors with single spaces.
data.text = data.text.apply(
    lambda tweet: ' '.join(
        token for token in nltk.word_tokenize(tweet) if token.lower() not in stopwords
    )
)

## Remove links

Link addresses don't provide much information about the content they point to, so they aren't useful in sentiment analysis and hence should be removed.

In [7]:
# Remove links: any run of non-space characters beginning at "http", optionally
# preceded by "(". Punctuation was stripped in an earlier cell, so the optional
# "(" has nothing left to match by the time this runs.
link_pattern = re.compile(r'\(?http\S+')
data['text'] = data['text'].apply(lambda tweet: link_pattern.sub('', tweet))

## Lemmatization

Lemmatization was chosen over stemming because it is less crude and seems to be more reliable.

In [8]:
# `lemmatizer` is reused later for the test set, so it stays a notebook-level name.
lemmatizer = nltk.WordNetLemmatizer()

# Tokenise each tweet, lemmatize token by token, and re-join with single spaces.
# lemmatize() is called without a part-of-speech argument, so its default applies.
data['text'] = data['text'].apply(
    lambda tweet: ' '.join(map(lemmatizer.lemmatize, nltk.word_tokenize(tweet)))
)

## Training a naive bayes sentiment classifier

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics

In [10]:
# Token-count encoder with default settings. It is fitted on the training text
# below and later reused, unchanged, to encode the test text.
vectorizer = CountVectorizer()

In [11]:
# Defensive guard before vectorising: the cleaning cells above return a string
# for every row, so this is not expected to remove anything.
data.dropna(axis='index', subset=['text'], inplace=True)

# Learn the vocabulary from the cleaned training text and encode it as token counts.
X = vectorizer.fit_transform(data['text'])

In [12]:
# Multinomial naive Bayes classifier.
nb = MultinomialNB()

# First argument: the feature matrix; second: the label we want to predict.
# X was built from `data`, so its rows line up with data['sentiment'].
nb.fit(X, y=data['sentiment'])

MultinomialNB()

# Test dataset

In [13]:
# Load the test set that predictions will be generated for.
test_data = pd.read_csv('test.csv')

In [14]:
# Discard test rows whose text is missing; the remaining rows keep their
# original index labels.
test_rows_without_text = test_data.index[test_data['text'].isna()]
test_data.drop(index=test_rows_without_text, inplace=True)

## Data Cleaning

In [15]:
# Same punctuation stripping as for the training set: delete every character
# that is neither a word character nor whitespace.
test_non_word_chars = re.compile(r'[^\w\s]')
test_data['text'] = test_data['text'].apply(
    lambda tweet: test_non_word_chars.sub('', tweet)
)

In [16]:
# Same stopword filter as for the training set; `stopwords` is the English
# stopword collection built earlier in the notebook.
test_data['text'] = test_data['text'].apply(
    lambda tweet: ' '.join(
        token for token in nltk.word_tokenize(tweet) if token.lower() not in stopwords
    )
)

In [17]:
# Same link removal as for the training set: drop any run of non-space
# characters beginning at "http", optionally preceded by "(".
test_link_pattern = re.compile(r'\(?http\S+')
test_data['text'] = test_data['text'].apply(
    lambda tweet: test_link_pattern.sub('', tweet)
)

## Lemmatization

In [18]:
# Lemmatize the test text with the same `lemmatizer` instance used for training:
# tokenise, lemmatize token by token, re-join with single spaces.
test_data['text'] = test_data['text'].apply(
    lambda tweet: ' '.join(map(lemmatizer.lemmatize, nltk.word_tokenize(tweet)))
)

## Generating predictions

In [19]:
# Defensive guard before vectorising: the cleaning cells above return a string
# for every row, so this is not expected to remove anything.
test_data.dropna(axis='index', subset=['text'], inplace=True)

# Encode the test text with the vocabulary learned from the training set
# (transform only, no refitting).
test_X = vectorizer.transform(test_data['text'])

In [20]:
# Predict a sentiment label for each row of test_X, i.e. for each row still
# present in test_data after the rows with missing text were dropped.
predicted = nb.predict(test_X)

In [21]:
# Re-read the raw test file so the output is built from the original, uncleaned columns.
# NOTE(review): unlike test_data, this frame has not had rows with missing text
# dropped, so it can contain more rows than there are entries in `predicted`.
prediction_data = pd.read_csv('test.csv')

In [22]:
# `predicted` holds one label per row of `test_data`, which had its missing-text
# rows dropped, while `prediction_data` is a fresh read of the same file and
# still contains them. Assigning the bare array would fail with a length
# mismatch whenever any row was dropped. Label the predictions with
# test_data's index (both frames share the default index from read_csv, and
# dropna keeps the original labels) and let pandas align them row by row.
# Rows that were never scored get NaN as their sentiment.
prediction_data = prediction_data.assign(
    sentiment=pd.Series(predicted, index=test_data.index)
)

In [23]:
# The output file should not carry the raw text, so drop that column.
prediction_data = prediction_data.drop(columns=['text'])

In [24]:
# Write the predictions to disk; index=False keeps the DataFrame index out of the file.
prediction_data.to_csv('Prediction data.csv',index=False)