In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split


In [2]:

# Read the pre-processed CSV into a DataFrame.
# NOTE(review): 'Preprocesssing' is spelled with three s's — confirm it matches the directory on disk.
df = pd.read_csv(filepath_or_buffer='../Preprocesssing/pre-processed.csv')


In [3]:

# Build the TF-IDF feature extractor with IDF re-weighting explicitly enabled.
vectorizer = TfidfVectorizer(
    use_idf=True,
)


In [4]:

# Learn the vocabulary from the 'tweets' column and turn each row into a TF-IDF vector.
# NOTE(review): the vectorizer is fit on every row of df, including rows that
# later end up in the test split — consider fitting on the training rows only.
tfidf_vectors = vectorizer.fit_transform(raw_documents=df['tweets'])


In [7]:

# Wrap the TF-IDF matrix in a DataFrame with one column per vocabulary term.
# The matrix is kept sparse instead of calling .toarray(): a dense
# documents x vocabulary array stores every zero explicitly and can exhaust
# memory on a large corpus. Sparse-backed columns keep the same labels and
# are still accepted by scikit-learn estimators.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_vectors,
    columns=vectorizer.get_feature_names_out(),
)


In [8]:

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# Features come from the TF-IDF frame, targets from the 'ClassLabel' column.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    df['ClassLabel'],
    test_size=0.2,
    random_state=42,
)


In [9]:

# Random forest of 100 trees, seeded so repeated runs build the same ensemble.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)


In [10]:

# 5-fold cross-validation on the training split only, scored by accuracy.
scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='accuracy',
)


In [11]:

# Show the per-fold accuracies followed by their mean, one per line.
print(
    f'Cross-validation scores: {scores}',
    f'Average cross-validation score: {scores.mean()}',
    sep='\n',
)


Cross-validation scores: [0.8779319  0.88297604 0.88398487 0.88549811 0.87991927]
Average cross-validation score: 0.882062038803201


In [12]:

# Train the forest on the full training split.
# Left as a bare expression so the notebook displays the fitted estimator.
clf.fit(X=X_train, y=y_train)


In [13]:

# Predict a class label for every held-out row.
predictions = clf.predict(X=X_test)

In [14]:

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

In [15]:

# Report the accuracy measured on the held-out split.
print(
    f'Test accuracy: {accuracy}'
)

Test accuracy: 0.886624974783135


In [16]:
# Per-class precision, recall, F1 and support on the held-out split.
# Overall accuracy alone hides how each individual class is handled.
# classification_report is imported in the notebook's first (imports) cell.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.59      0.10      0.17       287
           1       0.90      0.97      0.93      3851
           2       0.84      0.75      0.80       819

    accuracy                           0.89      4957
   macro avg       0.78      0.61      0.63      4957
weighted avg       0.87      0.89      0.87      4957

