In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment-analysis/data
/kaggle/input/sentiment-analysis-dataset/training.1600000.processed.noemoticon.csv
/kaggle/input/sentiment-analysis-dataset/train.csv
/kaggle/input/sentiment-analysis-dataset/testdata.manual.2009.06.14.csv
/kaggle/input/sentiment-analysis-dataset/test.csv


# **Importing the essential libraries**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report

# Reading the input dataset

In [3]:
# encoding='ISO-8859-1' is used so that the special characters in this file can be read
DATA_PATH = '/kaggle/input/sentiment-analysis-dataset/test.csv'
dataset = pd.read_csv(
    DATA_PATH,
    encoding='ISO-8859-1',
    quoting=csv.QUOTE_MINIMAL,
)

In [4]:
# Print the freshly loaded frame to eyeball whether it contains NaN rows.
# pandas truncates long frames when printing, so only the head and tail are shown.
print(dataset)

          textID                                               text sentiment  \
0     f87dea47db  Last session of the day  http://twitpic.com/67ezh   neutral   
1     96d74cb729   Shanghai is also really exciting (precisely -...  positive   
2     eee518ae67  Recession hit Veronique Branquinho, she has to...  negative   
3     01082688c6                                        happy bday!  positive   
4     33987a8ee5             http://twitpic.com/4w75p - I like it!!  positive   
...          ...                                                ...       ...   
4810         NaN                                                NaN       NaN   
4811         NaN                                                NaN       NaN   
4812         NaN                                                NaN       NaN   
4813         NaN                                                NaN       NaN   
4814         NaN                                                NaN       NaN   

     Time of Tweet Age of U

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [5]:
# Wrap the loaded data in a DataFrame named 'df', then discard the
# 'textID' and 'Country' columns; they are not kept as features.
df = pd.DataFrame(dataset)
dataset = df.drop(columns=['textID', 'Country'])


In [6]:
# Print the frame again to confirm that 'textID' and 'Country' are gone.
print(dataset)

                                                   text sentiment  \
0     Last session of the day  http://twitpic.com/67ezh   neutral   
1      Shanghai is also really exciting (precisely -...  positive   
2     Recession hit Veronique Branquinho, she has to...  negative   
3                                           happy bday!  positive   
4                http://twitpic.com/4w75p - I like it!!  positive   
...                                                 ...       ...   
4810                                                NaN       NaN   
4811                                                NaN       NaN   
4812                                                NaN       NaN   
4813                                                NaN       NaN   
4814                                                NaN       NaN   

     Time of Tweet Age of User  Population -2020  Land Area (Km²)  \
0          morning        0-20        38928346.0         652860.0   
1             noon       21-30   

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


# Dropping all the rows containing missing values (NaN values)

In [7]:
# Keep only the rows that have a value in every one of these columns.
required_columns = ['text', 'sentiment', 'Time of Tweet', 'Age of User', 'Population -2020']
dataset = dataset.dropna(subset=required_columns)
print(dataset)  # confirm that the rows containing NaN values were removed

                                                   text sentiment  \
0     Last session of the day  http://twitpic.com/67ezh   neutral   
1      Shanghai is also really exciting (precisely -...  positive   
2     Recession hit Veronique Branquinho, she has to...  negative   
3                                           happy bday!  positive   
4                http://twitpic.com/4w75p - I like it!!  positive   
...                                                 ...       ...   
3529  its at 3 am, im very tired but i can`t sleep  ...  negative   
3530  All alone in this old house again.  Thanks for...  positive   
3531   I know what you mean. My little dog is sinkin...  negative   
3532  _sutra what is your next youtube video gonna b...  positive   
3533   http://twitpic.com/4woj2 - omgssh  ang cute n...  positive   

     Time of Tweet Age of User  Population -2020  Land Area (Km²)  \
0          morning        0-20        38928346.0         652860.0   
1             noon       21-30   

# Assigning different columns for X and y

In [8]:
# Positions 0 and 2-6 (everything except 'sentiment') form the feature matrix X;
# position 1 ('sentiment') is the target y.
feature_positions = [0, 2, 3, 4, 5, 6]
target_position = 1
X = dataset.iloc[:, feature_positions].values
y = dataset.iloc[:, target_position].values
print(X)
print(y)

# X and y are split into a training set and a test set further down, before the model is trained.

[['Last session of the day  http://twitpic.com/67ezh' 'morning' '0-20'
  38928346.0 652860.0 60.0]
 [' Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China:  (SH)  (BJ).'
  'noon' '21-30' 2877797.0 27400.0 105.0]
 ['Recession hit Veronique Branquinho, she has to quit her company, such a shame!'
  'night' '31-45' 43851044.0 2381740.0 18.0]
 ...
 [' I know what you mean. My little dog is sinking into depression... he wants to move someplace tropical'
  'morning' '46-60' 206139589.0 910770.0 226.0]
 ['_sutra what is your next youtube video gonna be about? I love your videos!'
  'noon' '60-70' 25778816.0 120410.0 214.0]
 [' http://twitpic.com/4woj2 - omgssh  ang cute ng bby.!' 'night'
  '70-100' 2083374.0 25220.0 83.0]]
['neutral' 'positive' 'negative' ... 'negative' 'positive' 'positive']


# Cleaning the texts

In [9]:
nltk.download('stopwords')  # download the stopword lists used below
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stopword lookup once, outside the loop: they do not
# depend on the row being processed.
ps = PorterStemmer()  # stemming maps inflected forms to a common stem
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep 'not' so it survives the stopword filter
stopword_set = set(all_stopwords)  # set membership is O(1) per word

corpus = []  # one cleaned, stemmed string per row of dataset['text']

# Iterate over the column's values directly, so neither the number of rows nor
# the index labels are hard-coded (the index is not reset after dropna()).
for text in dataset['text']:
    review = re.sub('[^a-zA-Z]', ' ', text)  # replace every non-letter with a space
    review = review.lower().split()  # lowercase, then split on whitespace
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)  # rejoin the kept stems with single spaces
    corpus.append(review)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
 # corpus holds the cleaned, stemmed texts with the English stopwords (except 'not') removed

# Creating the bag of words model

In [11]:
# Bag-of-words model limited to the 4000 most frequent terms in the corpus.
cv = CountVectorizer(max_features=4000)
vectorized = cv.fit_transform(corpus).toarray()  # dense term-count matrix, one row per text

# Append the term counts to the existing feature matrix, then drop column 0
# (the raw text), since the counts now represent it numerically.
combined = np.concatenate((X, vectorized), axis=1)
X = pd.DataFrame(combined).iloc[:, 1:]
df = pd.DataFrame(X)

print(X[[1, 2]])

            1       2
0     morning    0-20
1        noon   21-30
2       night   31-45
3     morning   46-60
4        noon   60-70
...       ...     ...
3529     noon   21-30
3530    night   31-45
3531  morning   46-60
3532     noon   60-70
3533    night  70-100

[3534 rows x 2 columns]


In [12]:
# Print the feature frame: the remaining metadata columns followed by the bag-of-words counts.
print(X)

         1       2            3          4      5    6    7    8    9    10    \
0     morning    0-20   38928346.0   652860.0   60.0    0    0    0    0    0   
1        noon   21-30    2877797.0    27400.0  105.0    0    0    0    0    0   
2       night   31-45   43851044.0  2381740.0   18.0    0    0    0    0    0   
3     morning   46-60      77265.0      470.0  164.0    0    0    0    0    0   
4        noon   60-70   32866272.0  1246700.0   26.0    0    0    0    0    0   
...       ...     ...          ...        ...    ...  ...  ...  ...  ...  ...   
3529     noon   21-30    6624554.0   120340.0   55.0    0    0    0    0    0   
3530    night   31-45   24206644.0  1266700.0   19.0    0    0    0    0    0   
3531  morning   46-60  206139589.0   910770.0  226.0    0    0    0    0    0   
3532     noon   60-70   25778816.0   120410.0  214.0    0    0    0    0    0   
3533    night  70-100    2083374.0    25220.0   83.0    0    0    0    0    0   

      ... 3996 3997 3998 39

# Label Encode first and second column

In [13]:
df = pd.DataFrame(X)
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Replace the two categorical feature columns (positions 0 and 1) with integer codes,
# re-fitting the encoder on each column in turn.
for position in (0, 1):
    X.iloc[:, position] = le.fit_transform(X.iloc[:, position])
# The same encoder is fitted last on the target, so afterwards `le` maps the sentiment labels.
y = le.fit_transform(y)
print(X)
print(y)

     1    2            3          4      5    6    7    8    9    10    ...  \
0       0    0   38928346.0   652860.0   60.0    0    0    0    0    0  ...   
1       2    1    2877797.0    27400.0  105.0    0    0    0    0    0  ...   
2       1    2   43851044.0  2381740.0   18.0    0    0    0    0    0  ...   
3       0    3      77265.0      470.0  164.0    0    0    0    0    0  ...   
4       2    4   32866272.0  1246700.0   26.0    0    0    0    0    0  ...   
...   ...  ...          ...        ...    ...  ...  ...  ...  ...  ...  ...   
3529    2    1    6624554.0   120340.0   55.0    0    0    0    0    0  ...   
3530    1    2   24206644.0  1266700.0   19.0    0    0    0    0    0  ...   
3531    0    3  206139589.0   910770.0  226.0    0    0    0    0    0  ...   
3532    2    4   25778816.0   120410.0  214.0    0    0    0    0    0  ...   
3533    1    5    2083374.0    25220.0   83.0    0    0    0    0    0  ...   

     3996 3997 3998 3999 4000 4001 4002 4003 4004 4

# Feature scaling the third, fourth and fifth columns (the numeric ones)

In [14]:
# Standardise the numeric columns at positions 2, 3 and 4 so their large magnitudes
# do not dominate the columns holding small values.
# NOTE(review): the scaler is fitted on all rows before the train/test split that
# follows, so test-set statistics influence the training features.
numeric_positions = [2, 3, 4]
scaler = StandardScaler()
scaled_block = scaler.fit_transform(X.iloc[:, numeric_positions])
X.iloc[:, numeric_positions] = scaled_block
print(X)


     1    2         3         4         5    6    7    8    9    10    ...  \
0       0    0  -0.00334 -0.010544  -0.14689    0    0    0    0    0  ...   
1       2    1 -0.248825 -0.350677  -0.12401    0    0    0    0    0  ...   
2       1    2   0.03018   0.92964 -0.168246    0    0    0    0    0  ...   
3       0    3 -0.267895 -0.365321 -0.094011    0    0    0    0    0  ...   
4       2    4  -0.04462  0.312392 -0.164178    0    0    0    0    0  ...   
...   ...  ...       ...       ...       ...  ...  ...  ...  ...  ...  ...   
3529    2    1 -0.223311 -0.300135 -0.149433    0    0    0    0    0  ...   
3530    1    2 -0.103587  0.323269 -0.167737    0    0    0    0    0  ...   
3531    0    3  1.135275   0.12971 -0.062486    0    0    0    0    0  ...   
3532    2    4 -0.092881 -0.300097 -0.068588    0    0    0    0    0  ...   
3533    1    5 -0.254234 -0.351862 -0.135196    0    0    0    0    0  ...   

     3996 3997 3998 3999 4000 4001 4002 4003 4004 4005  
0     

# Splitting the dataset into the Training set and Test set

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Applying Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

# class_weight='balanced' makes the model pay more attention to minority classes.
logreg_params = dict(C=0.5, class_weight='balanced', max_iter=5000)
classifier = LogisticRegression(**logreg_params)
classifier.fit(X_train, y_train)  # train on the training set

In [17]:
y_pred = classifier.predict(X_test)  # predicted labels for the test set
# Side-by-side view: first column is the prediction, second column is the actual test label.
comparison = np.column_stack((y_pred, y_test))
print(comparison)

[[2 2]
 [1 2]
 [2 2]
 ...
 [1 2]
 [0 0]
 [1 1]]


# Evaluating the Logistic Regression model

In [18]:
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# The recorded run reached an accuracy of 66.19%, which is considered good for a classic ML model.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[136  57  11]
 [ 52 180  50]
 [ 12  57 152]]


0.6619519094766619

In [19]:
# Precision, recall and F1-score for every class.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.68      0.67      0.67       204
           1       0.61      0.64      0.62       282
           2       0.71      0.69      0.70       221

    accuracy                           0.66       707
   macro avg       0.67      0.66      0.67       707
weighted avg       0.66      0.66      0.66       707



**This model does not ignore any class (positive, negative or neutral) and achieves an accuracy of 66.19%. Please let me know if you get a higher score than this model. I'll be happy to learn.**