# Opinion Positivity and Training Models with ML Algorithms

### 1. Gathering Data

In [None]:
import pandas as pd


# Input CSV locations (relative paths).
csv_file = '../../data/test_sample.csv'
csv_file_bow = '../../data/bow_features.csv'

# Load both files into DataFrames: the sample rows first, then the second
# file (presumably precomputed bag-of-words features, judging by its name).
df, df_bow = (pd.read_csv(path) for path in (csv_file, csv_file_bow))

# Show the sample frame as the cell output.
df

### 2. Opinion Positivity Testing

In [None]:
from textblob import TextBlob
def get_sentiment(text):
    """Classify *text* as 'positive', 'negative' or 'neutral'.

    The label is the sign of the polarity score TextBlob assigns to the text.
    Missing or non-string values (for example the float NaN that pandas
    produces for an empty CSV cell) are labelled 'neutral' rather than being
    handed to TextBlob, which rejects non-string input with a TypeError.

    Args:
        text: The document to score.

    Returns:
        'positive' if the polarity is above 0, 'negative' if it is below 0,
        otherwise 'neutral'.
    """
    # Guard first so one missing cell cannot abort a whole DataFrame.apply().
    if not isinstance(text, str):
        return 'neutral'

    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

# Label every row with the sentiment of its 'text' column.
df['label'] = df['text'].apply(get_sentiment)

# Show the labelled frame as the cell output.
df

### 3. Save the Data with the Real Sentiment Labels

In [23]:
# Persist the labelled frame. index=False keeps the row index out of the file:
# this is the same path the notebook loads its data from, so writing the index
# would add another 'Unnamed: 0' column every time the file is reloaded and
# saved again.
df.to_csv('../../data/test_sample.csv', index=False)

### 4. Connect BOW + Logistic Regression

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer  # better for big data
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Train and evaluate a TF-IDF + logistic-regression sentiment classifier.
# Reads `df['label']` as the target and `df['clean_text']` as the documents.
# NOTE(review): 'clean_text' is not created in the cells visible here --
# confirm the column exists in the loaded CSV.

# 1. Encode the string labels as integers.
le = LabelEncoder()
y = le.fit_transform(df['label'])

# 2. Hold out 10% of the rows BEFORE vectorizing, so the vocabulary and IDF
#    weights are learned from the training rows only. Fitting the vectorizer
#    on the full corpus would leak test-set statistics into training.
texts = df['clean_text']
text_train, text_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.1, random_state=42, stratify=y
)

# 3. Vectorize: unigrams + bigrams, capped at 100,000 features.
vectorizer = TfidfVectorizer(
    max_features=100000,
    ngram_range=(1, 2),
    max_df=1.0,  # no upper pruning
    min_df=1,    # no lower pruning
)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full feature matrix in the original row order, retained under its previous
# name for any code that still reads `X`.
X = vectorizer.transform(texts)

# 4. Logistic regression with a solver suited to large sparse inputs.
model = LogisticRegression(
    max_iter=1000,
    solver='saga',        # good for large sparse datasets
    n_jobs=-1,            # use all CPUs
    verbose=1,            # print progress
)

# 5. Train the model.
model.fit(X_train, y_train)

# 6. Predict on the held-out rows and report the metrics.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))
