In [None]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import nltk
from pandarallel import pandarallel
import warnings
warnings.filterwarnings("ignore")


# --- One-time environment setup ---------------------------------------------
# Fetch the NLTK 'punkt' resource, start the pandarallel worker pool (with a
# progress bar) and build the shared VADER analyzer used by extract_features.
nltk.download('punkt')
pandarallel.initialize(progress_bar=True)
analyzer = SentimentIntensityAnalyzer()


# --- Load the raw CSV files --------------------------------------------------
train_data_full = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Train on a reproducible (seed 42) random half of the rows, re-indexed 0..n-1.
half_sample = train_data_full.sample(frac=0.5, random_state=42)
train_data = half_sample.reset_index(drop=True)


# --- Normalise the review text ----------------------------------------------
# Guarantee a 'Text' column that holds plain strings with no missing values;
# a frame without the column gets an all-empty one.
if 'Text' in train_data.columns:
    raw_text = train_data['Text']
else:
    raw_text = pd.Series('', index=train_data.index)
train_data['Text'] = raw_text.fillna('').astype(str)


def extract_features(df, is_test=False, product_avg=None, user_avg=None, global_avg=None):
    """Return a new DataFrame holding ``df`` plus the model's numeric features.

    The caller's frame is not modified; the result has a fresh 0..n-1 index.

    Columns added:
      * ``Sentiment_Neg`` / ``Sentiment_Neu`` / ``Sentiment_Pos`` /
        ``Sentiment_Compound`` - the four values returned by the module-level
        ``analyzer.polarity_scores`` for each ``Text``.
      * ``TextLength``, ``ExclamationCount``, ``QuestionCount``,
        ``CapitalRatio``, ``UppercaseWords`` - simple character/regex counts.
      * ``SentimentMapped`` - ``(compound + 1) * 2 + 1`` (sends -1 to 1, +1 to 5).
      * ``SentimentLevel`` - 1..5 bucket of the compound score with cut points
        -0.6, -0.2, 0.2 and 0.6 (upper bounds inclusive).
      * ``HelpfulnessRatio`` - numerator / (denominator + 1), or 0.0 when the
        helpfulness columns are absent.
      * ``Sentiment_Helpfulness``, ``Sentiment_TextLength`` - interaction terms.
      * ``ProductAvgScore``, ``UserAvgScore`` - mean ``Score`` per product/user.

    Args:
        df: Input reviews. A missing ``Text`` column is treated as all-empty.
        is_test: When false and ``df`` has a ``Score`` column, the per-product
            and per-user averages are computed from ``df`` itself. Otherwise
            they are looked up in ``product_avg`` / ``user_avg``.
        product_avg: Lookup frame with columns ``ProductId`` and
            ``ProductAvgScore`` (used in lookup mode only).
        user_avg: Lookup frame with columns ``UserId`` and ``UserAvgScore``
            (used in lookup mode only).
        global_avg: Fallback value for rows with no known product/user average
            in lookup mode. Defaults to the mean ``Score`` of the module-level
            ``train_data`` frame, which must then exist.

    Returns:
        The extended DataFrame with missing feature values filled by defaults.
    """
    # reset_index returns a new frame, so the assignments below never write
    # into the DataFrame owned by the caller.
    df = df.reset_index(drop=True)
    if 'Text' not in df.columns:
        df['Text'] = ''
    df['Text'] = df['Text'].fillna('').astype(str)

    sentiment_columns = ['Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos', 'Sentiment_Compound']

    def sentiment_scores(text):
        scores = analyzer.polarity_scores(text)
        return pd.Series({
            'Sentiment_Neg': scores['neg'],
            'Sentiment_Neu': scores['neu'],
            'Sentiment_Pos': scores['pos'],
            'Sentiment_Compound': scores['compound']
        })

    if df.empty:
        # Applying over zero rows does not produce the four score columns, so
        # build the empty block explicitly and skip the worker pool.
        sentiment_df = pd.DataFrame(columns=sentiment_columns, index=df.index, dtype=float)
    else:
        sentiment_df = df['Text'].parallel_apply(sentiment_scores).reset_index(drop=True)
    df = pd.concat([df, sentiment_df], axis=1)

    # Surface statistics of the raw text.
    df['TextLength'] = df['Text'].str.len()
    df['ExclamationCount'] = df['Text'].str.count("!")
    df['QuestionCount'] = df['Text'].str.count(r"\?")
    # +1 keeps the ratio defined for empty strings.
    df['CapitalRatio'] = df['Text'].str.count(r'[A-Z]') / (df['TextLength'] + 1)
    df['UppercaseWords'] = df['Text'].str.count(r'\b[A-Z]{2,}\b')

    compound = df['Sentiment_Compound']
    df['SentimentMapped'] = ((compound + 1) * 2) + 1
    # Vectorised bucketing; the first matching condition wins and anything
    # matching none of them (including NaN) falls through to 5.
    df['SentimentLevel'] = np.select(
        [compound <= -0.6, compound <= -0.2, compound <= 0.2, compound <= 0.6],
        [1, 2, 3, 4],
        default=5,
    )

    if 'HelpfulnessNumerator' in df.columns and 'HelpfulnessDenominator' in df.columns:
        # +1 avoids division by zero for reviews with no helpfulness votes.
        df['HelpfulnessRatio'] = df['HelpfulnessNumerator'] / (df['HelpfulnessDenominator'] + 1)
    else:
        df['HelpfulnessRatio'] = 0.0

    df['Sentiment_Helpfulness'] = df['Sentiment_Compound'] * df['HelpfulnessRatio']
    df['Sentiment_TextLength'] = df['Sentiment_Compound'] * df['TextLength']

    if not is_test and 'Score' in df.columns:
        # Training mode: derive the averages from this frame's own labels.
        # NOTE(review): every labelled row contributes its own Score to its
        # product/user average, so these two features leak the target during
        # training; consider out-of-fold or leave-one-out averages.
        df['Score'] = pd.to_numeric(df['Score'], errors='coerce')
        labelled = df.dropna(subset=['Score'])
        fallback_avg = df['Score'].mean()

        for key, column in (('ProductId', 'ProductAvgScore'), ('UserId', 'UserAvgScore')):
            if key in df.columns:
                group_means = (
                    labelled.groupby(key)['Score'].mean()
                    .reset_index()
                    .rename(columns={'Score': column})
                )
                df = df.merge(group_means, on=key, how='left')
            else:
                df[column] = fallback_avg
    else:
        # Lookup mode: reuse averages computed elsewhere.
        if global_avg is None:
            # Legacy default: relies on the module-level training frame.
            global_avg = train_data['Score'].mean()
        fallback_avg = global_avg

        lookups = (
            ('ProductId', 'ProductAvgScore', product_avg),
            ('UserId', 'UserAvgScore', user_avg),
        )
        for key, column, lookup in lookups:
            if key in df.columns and lookup is not None:
                df = df.merge(lookup, on=key, how='left')
            else:
                df[column] = fallback_avg

    # Defaults for anything still missing (e.g. unseen products or users).
    df = df.fillna({
        'SentimentMapped': 3, 'SentimentLevel': 3, 'HelpfulnessRatio': 0.0,
        'TextLength': 0, 'ExclamationCount': 0, 'QuestionCount': 0,
        'CapitalRatio': 0.0, 'UppercaseWords': 0, 'Sentiment_Helpfulness': 0.0,
        'Sentiment_TextLength': 0.0, 'ProductAvgScore': fallback_avg,
        'UserAvgScore': fallback_avg, 'Sentiment_Neg': 0.0,
        'Sentiment_Neu': 0.0, 'Sentiment_Pos': 0.0, 'Sentiment_Compound': 0.0
    })

    return df


# --- Build the training features --------------------------------------------
train_data = extract_features(train_data)
# Rows without a usable label cannot be trained on.
train_data = train_data.dropna(subset=['Score'])
train_data['Score'] = train_data['Score'].astype(int)


# Per-product and per-user mean scores from the labelled training rows; these
# are the lookup tables handed to extract_features for the test set.
product_avg = train_data.groupby('ProductId')['Score'].mean().reset_index().rename(columns={'Score': 'ProductAvgScore'})
user_avg = train_data.groupby('UserId')['Score'].mean().reset_index().rename(columns={'Score': 'UserAvgScore'})


feature_columns = [
    'SentimentMapped', 'SentimentLevel', 'HelpfulnessRatio', 'TextLength',
    'ExclamationCount', 'QuestionCount', 'CapitalRatio', 'UppercaseWords',
    'Sentiment_Helpfulness', 'Sentiment_TextLength', 'ProductAvgScore',
    'UserAvgScore', 'Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos', 'Sentiment_Compound'
]
X_numeric = train_data[feature_columns]
y = train_data['Score']


tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_text_tfidf = tfidf_vectorizer.fit_transform(train_data['Text'])


# Keep the fitted scaler: the test features must be standardised with the
# training mean/variance, not with statistics of the test set itself.
numeric_scaler = StandardScaler()
X_numeric_scaled = numeric_scaler.fit_transform(X_numeric)
X_combined = hstack([csr_matrix(X_numeric_scaled), X_text_tfidf], format='csr')


# --- Hyper-parameter search -------------------------------------------------
param_grid = {'C': [0.1, 0.5, 1.0], 'solver': ['newton-cg', 'lbfgs'], 'max_iter': [1000, 2000]}
log_reg = LogisticRegression(multi_class='multinomial', n_jobs=-1)
grid_search = GridSearchCV(log_reg, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)


grid_search.fit(X_combined, y)
best_params = grid_search.best_params_
# GridSearchCV refits the best configuration on the full data by default
# (refit=True), so reuse that estimator instead of training it a second time.
log_reg_model = grid_search.best_estimator_


# --- Evaluation -------------------------------------------------------------
# NOTE(review): the TF-IDF vocabulary, the scaler and the product/user
# averages were all fitted on every training row before this split, so the
# reported accuracy is likely optimistic.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(log_reg_model, X_combined, y, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"Cross-validation accuracy: {cv_scores.mean():.4f}")


# --- Build the test features ------------------------------------------------
# Pull the review text and metadata for each test Id from the full training
# file (one row per Id).
train_data_full_unique = train_data_full.drop_duplicates(subset='Id')
test_data = test_data.merge(train_data_full_unique[['Id', 'Text', 'ProductId', 'UserId', 'HelpfulnessNumerator', 'HelpfulnessDenominator']], on='Id', how='left')
test_data['Text'] = test_data['Text'].fillna('')


test_data = extract_features(test_data, is_test=True, product_avg=product_avg, user_avg=user_avg)


X_test_numeric = test_data[feature_columns]
# transform only: apply the statistics learned from the training features.
X_test_numeric_scaled = numeric_scaler.transform(X_test_numeric)
X_test_text_tfidf = tfidf_vectorizer.transform(test_data['Text'])
X_test_combined = hstack([csr_matrix(X_test_numeric_scaled), X_test_text_tfidf], format='csr')


# --- Predict and export -----------------------------------------------------
test_data['Score'] = log_reg_model.predict(X_test_combined)
# Defensive guard: keep the exported scores as integers within 1..5.
test_data['Score'] = test_data['Score'].round().clip(1, 5).astype(int)


test_data[['Id', 'Score']].to_csv('submission.csv', index=False)
print("Score prediction complete; file saved as 'submission.csv'.")
