### Importing the dataset

In [None]:
# API to fetch the dataset from kaggle
# NOTE(review): presumably requires the `kaggle` CLI to be installed and API credentials to be configured - confirm
# The next cell expects the downloaded archive as `sentiment140.zip` in the working directory
! kaggle datasets download -d kazanova/sentiment140

In [None]:
# Unpack the downloaded archive into the current working directory
from zipfile import ZipFile

dataset = 'sentiment140.zip'

# The context manager closes the archive as soon as extraction is done
with ZipFile(dataset, mode='r') as archive:
    archive.extractall()

print('Extracted all files from', dataset)

### Importing the dependencies

In [290]:
# Necessary libraries
import pandas as pd
import re
import pickle

# Natural Language Toolkit
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # e.g. and, the, a
from nltk.stem.porter import PorterStemmer # e.g. running -> run

# Machine Learning Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Exploring the dataset

In [145]:
# First look at the raw CSV. No column names are supplied here, so pandas uses
# the first record as the header row (which is why one row appears to be missing).
twitter_sentiment_df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
)

# Report the dimensions of the dataframe
row_count, column_count = twitter_sentiment_df.shape
print('Number of rows', row_count)
print('Number of columns', column_count)

# Preview the first 5 rows
twitter_sentiment_df.head()

Number of rows 1599999
Number of columns 6


Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [147]:
# Re-read the CSV with explicit column names so the first record is kept as data
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

twitter_sentiment_df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    names=columns,
)

row_count, column_count = twitter_sentiment_df.shape
print('Number of rows', row_count)  # 1600000 rows
print('Number of columns', column_count)  # 6 columns

twitter_sentiment_df.head()

Number of rows 1600000
Number of columns 6


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [169]:
# Rename the 'text' column to 'tweet' (reassignment instead of in-place mutation)
twitter_sentiment_df = twitter_sentiment_df.rename(columns={'text': 'tweet'})

In [170]:
# Relabel the target value 4 as 1 so the target column holds only 0 and 1
target_relabelled = twitter_sentiment_df['target'].replace(to_replace=4, value=1)
twitter_sentiment_df['target'] = target_relabelled

In [172]:
# Checking for null values
# Count nulls per column once, then report either outcome explicitly so a
# dataset WITH nulls is not mistaken for a cell that simply printed nothing.
null_counts = twitter_sentiment_df.isnull().sum()
total_nulls = int(null_counts.sum())

if total_nulls == 0:
    print('No null values found')
else:
    print('Null values found:', total_nulls)
    # Show only the columns that actually contain nulls
    print(null_counts[null_counts > 0])

No null values found


In [173]:
# Checking the distribution of the target column
# value_counts is vectorized, avoiding a Python-level loop over every row.
# dropna=False keeps missing labels visible so they are reported as unexpected.
target_counts = twitter_sentiment_df['target'].value_counts(dropna=False)

positive_count = int(target_counts.get(1, 0))
negative_count = int(target_counts.get(0, 0))

# Anything other than 0 / 1 is unexpected; report it once with its frequency
# instead of printing one line per offending row.
unexpected_counts = target_counts.drop(labels=[0, 1], errors='ignore')
if not unexpected_counts.empty:
    print("Error: Unexpected value in target column")
    print(unexpected_counts)

print('Number of positive tweets:', positive_count) # 800000 positive tweets
print('Number of negative tweets:', negative_count) # 800000 negative tweets

Number of positive tweets: 800000
Number of negative tweets: 800000


### Stemming the words
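Stemming is the process of reducing a word to its root form. Before stemming, non-alphabetic characters and English stopwords are removed from each tweet. <br>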
Example: **developing** is reduced to **develop**

In [175]:
english_stopwords = set(stopwords.words('english'))

# Compiled once: matches every character that is not an ASCII letter
_NON_ALPHA_PATTERN = re.compile('[^A-Za-z]')

# One shared stemmer instance; building a new PorterStemmer for every token
# is loop-invariant work repeated across the whole dataset.
_stemmer = PorterStemmer()

def stem_text(text):
    """Clean a tweet and reduce its words to their stems.

    Steps:
      1. Replace every non-alphabetic character with a space.
      2. Lowercase the text and split it on whitespace.
      3. Drop tokens found in ``english_stopwords``.
      4. Stem the remaining tokens with the Porter stemmer.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (an empty string when no
        token survives the filtering).
    """
    # Remove non-alphabetic characters, convert to lowercase and tokenize
    tokens = _NON_ALPHA_PATTERN.sub(' ', text).lower().split()

    # Stem each token, excluding stopwords, and join back into a single string
    return ' '.join(
        _stemmer.stem(token)
        for token in tokens
        if token not in english_stopwords
    )

In [177]:
# Run stem_text over every tweet and store the result in a new column
stemmed_tweets = twitter_sentiment_df['tweet'].map(stem_text)  # This should take some time
twitter_sentiment_df['stemmed_tweet'] = stemmed_tweets

In [184]:
# Compare the original and stemmed text for the first 5 rows
twitter_sentiment_df.head()[['tweet', 'stemmed_tweet']]

Unnamed: 0,tweet,stemmed_tweet
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


### Splitting the dataset into training and testing sets

In [225]:
# Features are the stemmed tweets; labels are the 0 / 1 sentiment targets
X = twitter_sentiment_df['stemmed_tweet'].values
Y = twitter_sentiment_df['target'].values

split_options = {
    'test_size': 0.2,     # 80% training, 20% testing
    'stratify': Y,        # Maintaining the distribution of the target column
    'random_state': 2,    # For reproducibility
}

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [255]:
# Report the number of samples on each side of the split, for features and labels
split_report = (
    ('For X:', X_train, X_test),
    ('For Y:', Y_train, Y_test),
)

for position, (heading, train_part, test_part) in enumerate(split_report):
    if position:
        print(' ')  # separator line between the two sections
    print(heading)
    print('Number of training samples:', train_part.shape[0], '(80% of the dataset)')  # 1280000 samples for training
    print('Number of testing samples:', test_part.shape[0], '(20% of the dataset)')  # 320000 samples for testing

For X:
Number of training samples: 1280000 (80% of the dataset)
Number of testing samples: 320000 (20% of the dataset)
 
For Y:
Number of training samples: 1280000 (80% of the dataset)
Number of testing samples: 320000 (20% of the dataset)


### Converting the textual data to numerical data

In [227]:
# Build TF-IDF features: the vectorizer is fitted on the training tweets only,
# and the same fitted vectorizer is then applied to the test tweets.
vectorizer = TfidfVectorizer()

# NOTE: X_train / X_test are overwritten with their vectorized versions here.
# Re-running this cell would feed the already-vectorized data back into the
# vectorizer, so re-run the train/test split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [254]:
# Report the dimensions of the vectorized training and testing matrices
matrix_report = (
    ('For X_train:', X_train),  # 1280000 samples, 461488 unique words
    ('For X_test:', X_test),    # 320000 samples, 461488 unique words
)

for position, (heading, matrix) in enumerate(matrix_report):
    if position:
        print(' ')  # separator line between the two sections
    sample_count, word_count = matrix.shape[0], matrix.shape[1]
    print(heading)
    print('Number of samples:', sample_count)
    print('Number of unique words:', word_count)

For X_train:
Number of samples: 1280000
Number of unique words: 461488
 
For X_test:
Number of samples: 320000
Number of unique words: 461488


### Training the model

In [256]:
# Fit a logistic regression classifier on the TF-IDF training features and labels.
# max_iter=1000 sets the solver's iteration cap above the library default
# (presumably to give the solver room to converge on this dataset - confirm).
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

### Evaluating the model

In [260]:
# Predict labels for the training features and score them against the true training labels
X_train_predictions = model.predict(X_train)
training_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_predictions)

In [262]:
# Show the training accuracy as a percentage with two decimals
training_accuracy_percent = training_accuracy * 100
print('Accuracy score on training data: {:.2f}%'.format(training_accuracy_percent))

Accuracy score on training data: 79.87%


In [263]:
# Predict labels for the held-out test features and score them against the true test labels
X_test_predictions = model.predict(X_test)
testing_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_predictions)

In [264]:
# Show the testing accuracy as a percentage with two decimals
testing_accuracy_percent = testing_accuracy * 100
print('Accuracy score on testing data: {:.2f}%'.format(testing_accuracy_percent))

Accuracy score on testing data: 77.67%


### Saving the model

In [267]:
file = 'model.pkl' # pkl -> pickle file

# Use a context manager so the file handle is flushed and closed
# deterministically (a bare open() inside pickle.dump leaves it open).
with open(file, 'wb') as model_file: # wb -> write in binary format
    pickle.dump(model, model_file)

# The model was trained on the output of the fitted TF-IDF vectorizer, so raw
# text can only be scored later if that same vectorizer is available too.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

### Using the model

In [268]:
# SECURITY: unpickling can execute arbitrary code - only load pickle files you
# created yourself or otherwise trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('model.pkl', 'rb') as model_file: # rb -> read binary
    loaded = pickle.load(model_file)

In [287]:
# Spot-check the reloaded model on a single test sample
sample_index = 7

X_new = X_test[sample_index]
expected_label = Y_test[sample_index]
print('Expected prediction: ', expected_label)

prediction = loaded.predict(X_new).item()
print('Predicted value: ', prediction)

print(' ')

print(
    'The model is working as expected'
    if prediction == expected_label
    else 'The model is not working as expected'
)

Expected prediction:  0
Predicted value:  0
 
The model is working as expected


In [288]:
# Spot-check the reloaded model on another single test sample
sample_index = 200

X_new = X_test[sample_index]
expected_label = Y_test[sample_index]
print('Expected prediction: ', expected_label)

prediction = loaded.predict(X_new).item()
print('Predicted value: ', prediction)

print(' ')

print(
    'The model is working as expected'
    if prediction == expected_label
    else 'The model is not working as expected'
)

Expected prediction:  1
Predicted value:  1
 
The model is working as expected
