# Preprocessing

In [18]:
# Import necessary libraries
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yasmine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yasmine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yasmine\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the CSV file into a DataFrame
df = pd.read_csv('twitter_training.csv')
df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [3]:
df.columns = ['Tweet ID', 'Entity','Sentiment','Tweet content']

In [4]:
df

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [5]:
df.isnull().sum()

Tweet ID           0
Entity             0
Sentiment          0
Tweet content    686
dtype: int64

In [6]:
df.dropna(inplace=True)
df.isnull().sum()

Tweet ID         0
Entity           0
Sentiment        0
Tweet content    0
dtype: int64

In [7]:
df.duplicated().sum()

2340

In [8]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [9]:
# Function to clean and preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    text.lower()
    re.sub(r'<.*?>', '', text)
    re.sub(r'http\S+|www\S+', '', text)
    re.sub(r'\d+', '', text)
    text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return words

# Apply the preprocessing function to the 'answer' column
df['Tweet content'] = df['Tweet content'].apply(preprocess_text)

# Display the the DataFrame to verify the cleaning
df

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet content
0,2401,Borderlands,Positive,"[coming, borders, kill]"
1,2401,Borderlands,Positive,"[im, getting, borderlands, kill]"
2,2401,Borderlands,Positive,"[im, coming, borderlands, murder]"
3,2401,Borderlands,Positive,"[im, getting, borderlands, 2, murder]"
4,2401,Borderlands,Positive,"[im, getting, borderlands, murder]"
...,...,...,...,...
74676,9200,Nvidia,Positive,"[realized, windows, partition, mac, like, 6, y..."
74677,9200,Nvidia,Positive,"[realized, mac, window, partition, 6, years, b..."
74678,9200,Nvidia,Positive,"[realized, windows, partition, mac, 6, years, ..."
74679,9200,Nvidia,Positive,"[realized, windows, partition, mac, like, 6, y..."


# Word Encoding

In [10]:
# Train Word2Vec model
word2vec_model = Word2Vec(df['Tweet content'], vector_size=100, window=5, min_count=1, sg=0)

# Example: Get the vector representation of a word
vector = word2vec_model.wv['window']

# Display the vector
print(f"Vector representation of 'window':\n{vector}")

Vector representation of 'window':
[-0.00943755  0.26099944 -0.03922266 -0.01492248  0.16864118 -0.07922755
  0.16793358  0.2302979   0.03463559 -0.15469164 -0.00123698 -0.17844383
  0.07346623 -0.018391    0.09865313 -0.22230211  0.16230637 -0.01300347
 -0.0756865  -0.11329022 -0.03875731 -0.15416846  0.08303196 -0.13134858
 -0.18429036 -0.20975675 -0.179855   -0.15282418 -0.01220256  0.11156218
 -0.00969717  0.00231799 -0.15227231 -0.06982458 -0.00325035  0.11097591
 -0.04369216  0.072916    0.1174055  -0.05178224  0.08291655 -0.08431213
  0.20255294  0.11600514  0.0940742  -0.09549088 -0.25435957 -0.01027803
  0.14617617  0.04083905  0.26633683 -0.05455856  0.13962556 -0.13510214
 -0.07551055  0.07156575  0.15867619 -0.0293057  -0.22990164  0.05975836
  0.06646955  0.08239646  0.00916726  0.10386416  0.1105414  -0.00270574
 -0.00972172  0.2609701   0.00963032 -0.09694183 -0.01970674 -0.01898998
  0.14368978 -0.01805769  0.11083362  0.11160186 -0.10591445  0.02540655
  0.03504689  0.

In [11]:
# Function to encode text using Word2Vec model
def encode_text(text, model):
    encoded_text = []
    for word in text:
        if word in model.wv:
            encoded_text.append(model.wv[word])
        else:
            encoded_text.append([0] * model.vector_size)  # Use zeros for out-of-vocabulary words
    
    if encoded_text:
        return np.mean(encoded_text, axis=0)  # Average the word vectors to get a single vector for the text
    else:
        return np.zeros(model.vector_size)  # Return zeros if all words are out-of-vocabulary

# Apply encoding to the 'cleaned_answer' column
df['encoded_Tweet content'] = df['Tweet content'].apply(lambda x: encode_text(x, word2vec_model))

In [12]:
df

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet content,encoded_Tweet content
0,2401,Borderlands,Positive,"[coming, borders, kill]","[0.04346965, 0.45610085, 0.081298344, 0.021134..."
1,2401,Borderlands,Positive,"[im, getting, borderlands, kill]","[-0.059645176, 0.58141416, -0.83193904, 0.3132..."
2,2401,Borderlands,Positive,"[im, coming, borderlands, murder]","[0.0724921, 0.8801872, -0.676708, 0.23314987, ..."
3,2401,Borderlands,Positive,"[im, getting, borderlands, 2, murder]","[0.19256392, 0.8078066, -0.52544177, -0.019594..."
4,2401,Borderlands,Positive,"[im, getting, borderlands, murder]","[0.0066717044, 0.6428493, -0.778016, 0.2610697..."
...,...,...,...,...,...
74676,9200,Nvidia,Positive,"[realized, windows, partition, mac, like, 6, y...","[0.025013788, 0.41522276, 0.067986384, -0.0940..."
74677,9200,Nvidia,Positive,"[realized, mac, window, partition, 6, years, b...","[0.021817235, 0.414009, 0.044669747, -0.134880..."
74678,9200,Nvidia,Positive,"[realized, windows, partition, mac, 6, years, ...","[-0.05365038, 0.41012084, -0.0028680812, -0.17..."
74679,9200,Nvidia,Positive,"[realized, windows, partition, mac, like, 6, y...","[0.08807629, 0.46296683, -0.04194421, -0.07029..."


# Models training

In [13]:
# Split the data into features (X) and target (y)
X = np.array(df['encoded_Tweet content'].tolist())
y = df['Sentiment']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Support Vector Machine (SVM)

In [15]:
# Initialize SVM classifier
svm_clf = SVC(kernel='linear', random_state=42)

# Train the SVM classifier
svm_clf.fit(X_train, y_train)

# Predict on the test data
y_pred_svm = svm_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy}")

# Print classification report
print(classification_report(y_test, y_pred_svm))

Accuracy: 0.5238294606098667
              precision    recall  f1-score   support

  Irrelevant       0.46      0.13      0.20      2455
    Negative       0.57      0.69      0.63      4433
     Neutral       0.47      0.54      0.50      3532
    Positive       0.53      0.56      0.55      3911

    accuracy                           0.52     14331
   macro avg       0.51      0.48      0.47     14331
weighted avg       0.52      0.52      0.50     14331



### Naive Bayes

In [20]:
# Define the Naive Bayes model
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train)

# Predict on the test data
y_pred_nb = nb_model.predict(X_test)

# Calculate accuracy
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy: {accuracy_nb}")

# Print classification report
print(classification_report(y_test, y_pred_nb))

Accuracy: 0.4258600237247924
              precision    recall  f1-score   support

  Irrelevant       0.24      0.43      0.31      2455
    Negative       0.58      0.48      0.53      4433
     Neutral       0.45      0.41      0.43      3532
    Positive       0.49      0.38      0.43      3911

    accuracy                           0.43     14331
   macro avg       0.44      0.42      0.42     14331
weighted avg       0.46      0.43      0.44     14331



### Logistic Regression

In [21]:
# Train Linear Regression model
lr_model = LogisticRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Predict on test data
y_pred_lr = lr_model.predict(X_test)

# Calculate accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy_lr}")

# Print classification report
print(classification_report(y_test, y_pred_lr))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.5178284837066499
              precision    recall  f1-score   support

  Irrelevant       0.45      0.13      0.20      2455
    Negative       0.57      0.69      0.62      4433
     Neutral       0.46      0.53      0.49      3532
    Positive       0.52      0.57      0.54      3911

    accuracy                           0.52     14331
   macro avg       0.50      0.48      0.46     14331
weighted avg       0.51      0.52      0.49     14331



### Ada Boosting  Algorithms

In [22]:
# Train Linear Regression model
ad_model = AdaBoostClassifier()

# Train the model
ad_model.fit(X_train, y_train)

# Predict on test data
y_pred_ad = ad_model.predict(X_test)

# Calculate accuracy
accuracy_ad = accuracy_score(y_test, y_pred_ad)
print(f"Accuracy: {accuracy_nb}")

# Print classification report
print(classification_report(y_test, y_pred_ad))



Accuracy: 0.4258600237247924
              precision    recall  f1-score   support

  Irrelevant       0.36      0.14      0.21      2455
    Negative       0.56      0.65      0.60      4433
     Neutral       0.45      0.49      0.47      3532
    Positive       0.49      0.55      0.52      3911

    accuracy                           0.50     14331
   macro avg       0.47      0.46      0.45     14331
weighted avg       0.48      0.50      0.48     14331



# Interpretation of the Obtained Results

<h2>SVM (Support Vector Machine)</h2>
    <p><strong>Accuracy:</strong> 52.38%</p>
    <p><strong>Interpretation:</strong></p>
    <ul>
        <li>SVM performs better than Naive Bayes and AdaBoost, but still has limited success.</li>
        <li>The model is good at predicting the "Negative" sentiment but struggles with "Irrelevant" and "Neutral" categories.</li>
        <li><strong>Precision:</strong> Highest for "Negative" sentiment, lowest for "Irrelevant".</li>
        <li><strong>Recall:</strong> Highest for "Negative" and "Positive" sentiments, lowest for "Irrelevant".</li>
        <li><strong>F1-score:</strong> Best for "Negative", moderate for "Positive", and lower for "Irrelevant" and "Neutral".</li>
    </ul>

<h2>Naive Bayes</h2>
    <p><strong>Accuracy:</strong> 42.59%</p>
    <p><strong>Interpretation:</strong></p>
    <ul>
        <li>Naive Bayes has the lowest accuracy among all models.</li>
        <li>The model struggles particularly with "Irrelevant" and "Positive" sentiments.</li>
        <li><strong>Precision:</strong> Lowest for "Irrelevant", highest for "Negative".</li>
        <li><strong>Recall:</strong> Best for "Negative", but lower for "Irrelevant" and "Positive".</li>
        <li><strong>F1-score:</strong> Generally lower across all classes compared to other models.</li>
    </ul>

<h2>Logistic Regression</h2>
    <p><strong>Accuracy:</strong> 51.78%</p>
    <p><strong>Interpretation:</strong></p>
    <ul>
        <li>Logistic Regression performs similarly to SVM.</li>
        <li>It shows good performance for "Negative" sentiment but struggles with "Irrelevant" and "Neutral" categories.</li>
        <li><strong>Precision:</strong> Highest for "Negative", lowest for "Irrelevant".</li>
        <li><strong>Recall:</strong> Highest for "Negative" and "Positive", lowest for "Irrelevant".</li>
        <li><strong>F1-score:</strong> Best for "Negative", moderate for "Positive", and lower for "Irrelevant" and "Neutral".</li>
    </ul>

<h2>AdaBoost</h2>
    <p><strong>Accuracy:</strong> 42.59%</p>
    <p><strong>Interpretation:</strong></p>
    <ul>
        <li>AdaBoost has the same accuracy as Naive Bayes, but slightly better F1-scores for all classes.</li>
        <li>It performs relatively well for "Negative" and "Positive" sentiments but struggles with "Irrelevant" and "Neutral".</li>
        <li><strong>Precision:</strong> Lowest for "Irrelevant", highest for "Negative".</li>
        <li><strong>Recall:</strong> Best for "Negative", but lower for "Irrelevant" and "Neutral".</li>
        <li><strong>F1-score:</strong> Moderate for "Negative", "Positive", and "Neutral", but lower for "Irrelevant".</li>
    </ul>

<h3>Summary and Choice of Best Model</h3>

<p>Evaluating the Models: Here's a summary of the performance metrics for each model:</p>

<h2>Summary of Model Performance Metrics</h2>
    <table border="1" cellpadding="5" cellspacing="0">
        <thead>
            <tr>
                <th>Model</th>
                <th>Accuracy</th>
                <th>Precision (Weighted Avg)</th>
                <th>Recall (Weighted Avg)</th>
                <th>F1-score (Weighted Avg)</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>SVM</td>
                <td>52.38%</td>
                <td>52%</td>
                <td>52%</td>
                <td>50%</td>
            </tr>
            <tr>
                <td>Naive Bayes</td>
                <td>42.59%</td>
                <td>42%</td>
                <td>43%</td>
                <td>42%</td>
            </tr>
            <tr>
                <td>Logistic Regression</td>
                <td>51.78%</td>
                <td>51%</td>
                <td>52%</td>
                <td>49%</td>
            </tr>
            <tr>
                <td>AdaBoost</td>
                <td>42.59%</td>
                <td>48%</td>
                <td>50%</td>
                <td>48%</td>
            </tr>
        </tbody>
    </table>

<h2>Choosing the Best Model</h2>
    <p>Based on the performance metrics:</p>
    <ul>
        <li><strong>Accuracy:</strong> SVM and Logistic Regression perform similarly, with SVM having a slight edge.</li>
        <li><strong>Precision:</strong> SVM, Logistic Regression, and Naive Bayes have similar precision scores, with SVM being slightly better.</li>
        <li><strong>Recall:</strong> SVM and Logistic Regression have the highest recall, indicating their ability to correctly identify instances of each class.</li>
        <li><strong>F1-score:</strong> SVM and Logistic Regression have higher F1-scores compared to Naive Bayes and AdaBoost, indicating a balance between precision and recall.</li>
    </ul>

<h2>Conclusion</h2>
    <p>Considering all these factors, SVM seems to be the preferred choice at this stage:</p>
    <ul>
        <li>It has the highest accuracy among the models.</li>
        <li>It shows balanced precision and recall across different sentiment classes.</li>
        <li>It has the highest F1-score, indicating a good balance between precision and recall.</li>
    </ul>
    <p>While Logistic Regression performs similarly, SVM edges it out slightly in terms of accuracy and F1-score. Therefore, SVM would be the recommended model for sentiment analysis in this scenario.</p>