In [1]:
!pip install pandas scikit-learn joblib



In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

In [3]:
file_path = '/content/all-data.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1')

In [4]:
data.columns = ['sentiment', 'news_text']

In [5]:
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4845 entries, 0 to 4844
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  4845 non-null   object
 1   news_text  4845 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


Unnamed: 0,sentiment,news_text
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [6]:
print("Missing values:", data.isnull().sum())

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

data['news_text'] = data['news_text'].apply(preprocess_text)
data.head()

Missing values: sentiment    0
news_text    0
dtype: int64


Unnamed: 0,sentiment,news_text
0,neutral,technopolis plans to develop in stages an area...
1,negative,the international electronic industry company ...
2,positive,with the new production plant the company woul...
3,positive,according to the company s updated strategy fo...
4,positive,financing of aspocomp s growth aspocomp is agg...


In [7]:
X = data['news_text']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [8]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

initial_report = classification_report(y_test, y_pred)
print("Initial Model Evaluation Report:\n", initial_report)

Initial Model Evaluation Report:
               precision    recall  f1-score   support

    negative       0.90      0.46      0.61       115
     neutral       0.74      0.93      0.82       567
    positive       0.77      0.52      0.62       287

    accuracy                           0.75       969
   macro avg       0.80      0.64      0.69       969
weighted avg       0.77      0.75      0.74       969



In [9]:
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['liblinear']
}

grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=3, scoring='f1_macro')

grid_search.fit(X_train_tfidf, y_train)

best_model = grid_search.best_estimator_

y_pred_tuned = best_model.predict(X_test_tfidf)

tuned_report = classification_report(y_test, y_pred_tuned)
print("Fine-tuned Model Evaluation Report:\n", tuned_report)

Fine-tuned Model Evaluation Report:
               precision    recall  f1-score   support

    negative       0.77      0.55      0.64       115
     neutral       0.77      0.88      0.82       567
    positive       0.74      0.61      0.67       287

    accuracy                           0.76       969
   macro avg       0.76      0.68      0.71       969
weighted avg       0.76      0.76      0.75       969



In [10]:
model_filename = '/content/sentiment_model.pkl'
vectorizer_filename = '/content/tfidf_vectorizer.pkl'

joblib.dump(best_model, model_filename)
joblib.dump(vectorizer, vectorizer_filename)

print(f"Model saved to {model_filename}")
print(f"Vectorizer saved to {vectorizer_filename}")

Model saved to /content/sentiment_model.pkl
Vectorizer saved to /content/tfidf_vectorizer.pkl
