## Import Dependencies & Dataset

In [1]:
import os
import re
import json
import pandas as pd
import numpy as np

# NLP imports
import nltk
nltk.download('stopwords')  # unneeded words, shouldn't process
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Model imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brookemattos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Kaggle API credentials setup (expects kaggle.json in the working directory)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Fetch dataset through API (file too large to manually download)
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# Extract dataset from .zip to .csv
from zipfile import ZipFile
dataset = "sentiment140.zip"
# Bind the archive to a name that does not shadow the built-in zip(); at notebook
# (global) scope the original name would leave zip pointing at a closed ZipFile
# for every later cell.
with ZipFile(dataset, 'r') as archive:
    archive.extractall()  # extracts every member into the current working directory
    print("... dataset extraction complete ...")

... dataset extraction complete ...


## Data Pre-processing & Observation

In [4]:
# Load csv to dataframe
# NOTE: neither header=None nor names= is passed here, so the first record of the
# file is consumed as the column header (the shape below is 1599999 rows and head()
# shows a tweet as the header row). The reload with explicit column names further
# down corrects this.
# NOTE(review): the extraction cell above does not show a file with this name being
# produced - confirm the extracted CSV is renamed to "twitter_sentiment_data.csv".
df = pd.read_csv("twitter_sentiment_data.csv", encoding="ISO-8859-1")

In [5]:
df.shape   # Visualize size

(1599999, 6)

In [6]:
df.head()  # Visualize df

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [7]:
# Reload with explicit column names so the first record is kept as data
# instead of being taken as the header row
column_names = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(
    "twitter_sentiment_data.csv",
    encoding="ISO-8859-1",
    names=column_names,
)
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Handle missing values - (no need to drop, no missing values in dataset)
df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [9]:
# Ensure even distribution of target {0, 4} - (no sampling needed, distribution is even)
df["target"].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [10]:
# Recode target value 4 as 1 so the labels become [0, 1]
df["target"] = df["target"].replace({4: 1})
df["target"].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

### <u style="color: goldenrod">DEFN -> [ 0: negative tweet | 1: positive tweet ]</u>

## Stemming

In [11]:
# Collapse every word of a tweet to its root form (Porter stem).
# The stemmer and the stop-word set are built once and shared by all calls.
portStem = PorterStemmer()
stop_words = set(stopwords.words("english"))


def stemming(word):
    """Normalise one tweet into a space-separated string of word stems.

    Args:
        word: raw tweet text (a whole tweet, despite the parameter name; it is
            applied to every entry of df["text"] below).

    Returns:
        The text with every non-letter replaced by a space, lower-cased,
        stop words removed, each remaining token Porter-stemmed, and the
        result joined back together with single spaces.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", word)  # anything outside a-z / A-Z becomes a space
    tokens = letters_only.lower().split()
    roots = []
    for token in tokens:
        if token in stop_words:
            continue  # stop words are dropped rather than stemmed
        roots.append(portStem.stem(token))
    return " ".join(roots)


df["stemmed"] = df["text"].apply(stemming)

In [12]:
# NOTE: the stemming cell above took about one minute to run

In [13]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


## Prepping the Model

In [14]:
# Pull out the two arrays the model needs: stemmed text (features) and target (labels)
x = df["stemmed"].values
y = df["target"].values
print(x, y, sep="\n")

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']
[0 0 0 ... 1 1 1]


In [15]:
# Hold out 20% of the samples as test data; the split is stratified on the labels
# and uses a fixed random_state so it is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [16]:
# Convert stemmed data from textual to numerical
#    - TfidfVectorizer turns each tweet into a vector of TF-IDF term weights
#    - it is fitted on the training split only; the test split is transformed
#      with that same fitted vectorizer
# NOTE: x_train / x_test are overwritten with the vectorized output, so re-running
#       this cell on its own would feed already-vectorized data back into the
#       vectorizer; re-run the train/test split cell first.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

## Logistic Regression Model

In [17]:
# training the model
# max_iter=1000 sets the solver's iteration cap explicitly; the model is fitted on
# the vectorized training split and its labels
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

## Evaluate Model

In [18]:
# TRAINING ACCURACY
# x_train_pred holds the labels predicted for the training features
x_train_pred = model.predict(x_train)
train_acc = accuracy_score(y_true=y_train, y_pred=x_train_pred)
print(f"Training accuracy: {train_acc}")

Training accuracy: 0.8102265625


In [19]:
# TESTING ACCURACY
# x_test_pred holds the labels predicted for the held-out test features
x_test_pred = model.predict(x_test)
test_acc = accuracy_score(y_true=y_test, y_pred=x_test_pred)
print(f"Test accuracy: {test_acc}")

Test accuracy: 0.777996875


# Model Accuracy: 78%