In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer



In [12]:
df = pd.read_csv('/Users/alizakhan/reddit_scraper/SentiStocks/modelTrainingData/final_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13970 entries, 0 to 13969
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  13970 non-null  int64 
 1   Text        13970 non-null  object
 2   Sentiment   13970 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 327.6+ KB


In [27]:
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
# X = vectorizer.fit_transform(X)



In [28]:

X = df['Text']
y = df['Sentiment']
X = vectorizer.fit_transform(X)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [33]:
# Train an SVM model
svm_model = SVC(kernel='rbf', C=10, random_state=42)
svm_model.fit(X_train, y_train)


In [34]:
# Predict on the test data
y_pred = svm_model.predict(X_test)


In [35]:
# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8346456692913385
Confusion Matrix:
 [[1278  403]
 [ 290 2220]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.76      0.79      1681
           1       0.85      0.88      0.86      2510

    accuracy                           0.83      4191
   macro avg       0.83      0.82      0.83      4191
weighted avg       0.83      0.83      0.83      4191



In [32]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {'C': [0.1, 1, 10, 100],
              'kernel': ['linear', 'rbf']}

# Perform grid search
grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, cv=5)
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ..............C=0.1, kernel=linear;, score=0.701 total time=   4.2s
[CV 2/5] END ..............C=0.1, kernel=linear;, score=0.696 total time=   4.1s
[CV 3/5] END ..............C=0.1, kernel=linear;, score=0.706 total time=   4.2s
[CV 4/5] END ..............C=0.1, kernel=linear;, score=0.697 total time=   4.1s
[CV 5/5] END ..............C=0.1, kernel=linear;, score=0.690 total time=   4.2s
[CV 1/5] END .................C=0.1, kernel=rbf;, score=0.602 total time=   4.6s
[CV 2/5] END .................C=0.1, kernel=rbf;, score=0.601 total time=   4.7s
[CV 3/5] END .................C=0.1, kernel=rbf;, score=0.605 total time=   4.7s
[CV 4/5] END .................C=0.1, kernel=rbf;, score=0.606 total time=   4.6s
[CV 5/5] END .................C=0.1, kernel=rbf;, score=0.597 total time=   4.6s
[CV 1/5] END ................C=1, kernel=linear;, score=0.818 total time=   3.2s
[CV 2/5] END ................C=1, kernel=linear;,