In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


In [26]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [27]:
# Load the tweet dataset and show the first rows of the two columns used for modelling.
df = pd.read_csv("tweets_with_sentiment_1.csv")
preview = df.loc[:, ['text', 'sentiment']].head()
print(preview)



                                                text sentiment
0  Blue Ridge Bank shares halted by NYSE after bi...  positive
1  😎 Today, that's this Thursday, we will do a "🎬...  positive
2  Guys evening, I have read this article about B...  positive
3  $BTC A big chance in a billion! Price: \487264...  positive
4  This network is secured by 9 508 nodes as of t...  positive


In [28]:
# Optional numeric encoding of the sentiment classes
# (positive -> 1, neutral -> 0, negative -> -1), stored in a new `label` column.
# Sentiment values that are not keys of the mapping become NaN.
label_map = dict(positive=1, neutral=0, negative=-1)
df['label'] = df['sentiment'].map(label_map)



In [29]:
# Build the model input directly from the DataFrame and replace missing tweets
# with empty strings. Taking the column from `df` (instead of a pre-existing X)
# lets this cell run on a fresh kernel, where no earlier cell has defined X.
X = df['text'].fillna("")



In [30]:
# Alternative missing-value handling: drop tweets without text and keep the
# labels aligned with the rows that remain.
# Both values are taken from `df` so the cell does not rely on X / y left over
# from an earlier kernel session (y is first assigned in a later cell).
# The later "Input and output" cell re-assigns X and y, so this is exploratory.
X = df['text'].dropna()
y = df.loc[X.index, 'label']  # Keep corresponding labels in sync



In [31]:
# Sanity check: list the DataFrame's columns (the `label` column added above should appear).
print(df.columns)


Index(['date', 'text', 'hashtags', 'is_retweet', 'lang', 'sentiment', 'label'], dtype='object')


In [32]:
# Model input: the tweet text, with missing entries replaced by "".
X = df["text"].fillna(value="")
# Model target: the numeric sentiment code stored in the `label` column.
y = df["label"]


In [33]:
# Override the numeric target assigned in the previous cell: from here on the
# target is the `sentiment` column itself (the string class names).
y = df['sentiment']

In [34]:
# 2. Hold out 20% of the tweets for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [35]:
# 3. Turn the raw text into TF-IDF features.
# The vocabulary (capped at 5000 terms, English stop words removed) is learned
# from the training tweets only; the test tweets are transformed with that
# already-fitted vectorizer.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [36]:
# 4. Fit a linear-kernel support vector classifier on the TF-IDF training matrix.
svm_clf = SVC(
    random_state=42,
    kernel='linear',
)
svm_clf.fit(X_train_tfidf, y_train)

In [37]:
# 5. Predict and evaluate
# Predict a sentiment class for every held-out tweet from its TF-IDF features;
# the predictions are scored against y_test in the next cell.
y_pred = svm_clf.predict(X_test_tfidf)


In [38]:
# Score the predictions on the held-out set: overall accuracy first, then the
# per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.9045

Classification Report:
               precision    recall  f1-score   support

    negative       0.87      0.71      0.78      1435
     neutral       0.88      0.97      0.92      4252
    positive       0.94      0.91      0.93      4313

    accuracy                           0.90     10000
   macro avg       0.90      0.86      0.88     10000
weighted avg       0.91      0.90      0.90     10000

