In [20]:
import re
import string

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import STOPWORDS

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

import joblib

SEED = 42

In [3]:
DATASET_COLUMNS = ['Sentiment', 'Id', 'Date', 'Query', 'User', 'Text']
DATASET_ENCODING = 'ISO-8859-1'

In [4]:
df = pd.read_csv('data/training.1600000.processed.noemoticon.csv', encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
print(f"Shape of the data: {df.shape}")
df.head()

Shape of the data: (1600000, 6)


Unnamed: 0,Sentiment,Id,Date,Query,User,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df['Sentiment'] = df['Sentiment'].map({0: 'Negative', 4: 'Positive'})
df.drop(['Id', 'Query'], axis=1, inplace=True)

In [9]:
stopwords_en = set(stopwords.words('english')) | STOPWORDS
wnl = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    words = [wnl.lemmatize(word) for word in text.split() if word not in stopwords_en]
    text = ' '.join(words)

    text = re.sub('@\[A-Za-z0-9\]+', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [10]:
X = df['Text'].apply(clean_text)
y = df['Sentiment']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [22]:
model = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB()
)
model.fit(X_train, y_train)

In [23]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.761121875


In [24]:
joblib.dump(model, 'models/model.joblib')

['models/model.joblib']