In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier


In [8]:
df = pd.read_csv('../data/Covid-19 Twitter Dataset (Apr-Jun 2020).csv')

#### Split the dataset into features (X) and target (y)

In [9]:
X = df['clean_tweet']
y = df['sentiment']

#### Split the data into training and testing sets

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Create a TF-IDF vectorizer

In [11]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

In [12]:
print(type(X_train))

<class 'pandas.core.series.Series'>


#### Fit and transform the training data

In [13]:
# Fill missing values in the 'clean_tweet' column with an empty string
X_train = X_train.fillna('')

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

#### Transform the testing data

In [14]:
# Fill missing values in the 'clean_tweet' column of the test data with an empty string
X_test = X_test.fillna('')

# Transform the testing data using the same TF-IDF vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)


### Create and train a Logistic Regression model


In [15]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Make predictions

In [16]:
y_pred = logistic_regression.predict(X_test_tfidf)

 ### Evaluate the model

In [17]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.96
              precision    recall  f1-score   support

         neg       0.98      0.93      0.95      8023
         neu       0.95      0.99      0.97     11354
         pos       0.97      0.96      0.96      9404

    accuracy                           0.96     28781
   macro avg       0.96      0.96      0.96     28781
weighted avg       0.96      0.96      0.96     28781



### Decision Trees Algorithm

#### Create and train a Decision Tree model

In [18]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_tfidf, y_train)

### Make predictions

In [19]:
y_pred_tree = decision_tree.predict(X_test_tfidf)

###  Evaluate the model

In [20]:
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print(f"Accuracy (Decision Tree): {accuracy_tree:.2f}")
print(classification_report(y_test, y_pred_tree))

Accuracy (Decision Tree): 0.95


              precision    recall  f1-score   support

         neg       0.93      0.91      0.92      8023
         neu       0.99      0.99      0.99     11354
         pos       0.93      0.94      0.93      9404

    accuracy                           0.95     28781
   macro avg       0.95      0.95      0.95     28781
weighted avg       0.95      0.95      0.95     28781



### Max Depth Analysis

In [21]:
max_depths = [5, 10, 15, 20, None]  # Experiment with different maximum depths
for max_depth in max_depths:
    decision_tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    decision_tree.fit(X_train_tfidf, y_train)
    y_pred_tree = decision_tree.predict(X_test_tfidf)
    accuracy_tree = accuracy_score(y_test, y_pred_tree)
    print(f"Accuracy (Decision Tree, Max Depth={max_depth}): {accuracy_tree:.2f}")


Accuracy (Decision Tree, Max Depth=5): 0.53
Accuracy (Decision Tree, Max Depth=10): 0.59
Accuracy (Decision Tree, Max Depth=15): 0.64
Accuracy (Decision Tree, Max Depth=20): 0.66
Accuracy (Decision Tree, Max Depth=None): 0.95


### Support Vector Machines

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [23]:
X = X.fillna('')

#### Replace NaN values with empty strings

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### Create a TF-IDF vectorizer

In [25]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

#### Fit and transform the training data

In [26]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

#### Transform the testing data using the same TF-IDF vectorizer

In [27]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)

#### Create and train a Linear SVM model

In [28]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

#### Make predictions

In [29]:
y_pred_svm = svm_model.predict(X_test_tfidf)

#### Evaluate the model

In [30]:
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy (SVM): {accuracy_svm:.2f}")
print(classification_report(y_test, y_pred_svm))
      

Accuracy (SVM): 0.98
              precision    recall  f1-score   support

         neg       0.98      0.95      0.96      8023
         neu       0.98      0.99      0.99     11354
         pos       0.97      0.97      0.97      9404

    accuracy                           0.98     28781
   macro avg       0.98      0.97      0.97     28781
weighted avg       0.98      0.98      0.98     28781

