In [9]:
import pandas as pd
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Workbook that supplies both worksheets used below
file_path = 'the final dataset.xlsx'  # Adjust if the workbook lives somewhere else
excel_data = pd.ExcelFile(file_path)

# "Sheet1" is read as the attributes table and "Sheet2" as the reviews table
sheet1_data, sheet2_data = (
    excel_data.parse(worksheet) for worksheet in ('Sheet1', 'Sheet2')
)

# Single VADER analyzer instance, created once and reused for every review
analyzer = SentimentIntensityAnalyzer()

# Run sentiment analysis on each review in Sheet2
def get_sentiment_label(text):
    """Return 'positive', 'negative' or 'neutral' for *text* using VADER.

    The input is passed through ``str()`` before scoring, so non-string
    values are scored on their string form. The label is chosen from the
    compound polarity score: at or above 0.05 is 'positive', at or below
    -0.05 is 'negative', and anything in between is 'neutral'.
    """
    polarity = analyzer.polarity_scores(str(text))['compound']
    if polarity >= 0.05:
        return 'positive'
    if polarity <= -0.05:
        return 'negative'
    return 'neutral'

# Label every review in Sheet2 with its VADER sentiment class
sheet2_data['Sentiment'] = sheet2_data['Review'].apply(get_sentiment_label)

# Encode the sentiment labels as integer class ids (the chi-square target)
label_encoder = LabelEncoder()
sheet2_data['Sentiment_Label'] = label_encoder.fit_transform(sheet2_data['Sentiment'])

# Merge Sheet1 (attributes) with Sheet2 (reviews) on 'Phone Name'.
# The inner join keeps only phones that appear in both sheets.
merged_data = pd.merge(sheet1_data, sheet2_data, on='Phone Name', how='inner')

# An empty join would otherwise surface later as an obscure scikit-learn
# error, so fail here with a message that points at the real cause.
if merged_data.empty:
    raise ValueError(
        "No rows left after merging Sheet1 and Sheet2 on 'Phone Name'; "
        "check that the phone names match between the two sheets."
    )

# Prepare feature matrix X and target variable y.
# The join key is dropped by name rather than by position, so the features
# stay correct even when 'Phone Name' is not the first column of Sheet1.
feature_columns = sheet1_data.columns.drop('Phone Name')
X = merged_data[feature_columns]
y = merged_data['Sentiment_Label']

# Convert categorical features in X to one-hot encoding.
# NOTE: chi2 only accepts non-negative feature values, so any numeric
# column passed through unchanged here must not contain negatives.
X_encoded = pd.get_dummies(X)

# Perform Chi-Square test and select top K features
k_best_features = 10  # Requested number of features, adjust as needed
# Never ask for more features than the encoded matrix actually has.
k_effective = min(k_best_features, X_encoded.shape[1])
chi2_selector = SelectKBest(score_func=chi2, k=k_effective)
X_kbest = chi2_selector.fit_transform(X_encoded, y)

# Boolean mask over the columns of X_encoded marking the selected features
support_mask = chi2_selector.get_support()
selected_features = X_encoded.columns[support_mask]

# Display selected features and their scores, highest score first
feature_scores = pd.DataFrame({
    'Feature': selected_features,
    'Chi2_Score': chi2_selector.scores_[support_mask]
})
print("Selected Features based on Chi-Square Test:\n", feature_scores.sort_values(by='Chi2_Score', ascending=False))


ModuleNotFoundError: No module named 'vaderSentiment'