# In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load datasets
apps = pd.read_csv('data/apps_data.csv')
reviews = pd.read_csv('data/user_reviews.csv')

# Quick look at data
print("Apps data shape:", apps.shape)
print("Reviews data shape:", reviews.shape)
print(apps.head())
print(reviews.head())

# Merge reviews with apps on 'App' column.
# An app listed more than once in the apps table would fan each of its reviews
# out into several identical rows under a left join, inflating every count
# computed downstream. Keep a single Category per app before merging, and let
# pandas verify that the join really is many reviews -> one app.
app_categories = apps[['App', 'Category']].drop_duplicates(subset='App')
data = pd.merge(
    reviews,
    app_categories,
    on='App',
    how='left',
    validate='many_to_one',
)

# Drop rows with missing reviews
data = data.dropna(subset=['Translated_Review'])

# Sentiment analysis function using TextBlob
def get_sentiment(text, threshold=0.1):
    """Classify a review as 'Positive', 'Negative' or 'Neutral'.

    Args:
        text: Review text to score with TextBlob.
        threshold: Half-width of the neutral band. Polarity must be greater
            than ``threshold`` to count as positive and less than
            ``-threshold`` to count as negative. Defaults to 0.1.

    Returns:
        One of 'Positive', 'Negative' or 'Neutral'. Missing, non-string or
        blank input is labelled 'Neutral' rather than raising.
    """
    # NaN/None cells reach this function when it is applied to a column that
    # was not cleaned first; TextBlob rejects non-string input, and there is
    # no text to score anyway, so treat such values as neutral.
    if not isinstance(text, str) or not text.strip():
        return 'Neutral'

    # Read the polarity once and compare the stored value.
    polarity = TextBlob(text).sentiment.polarity
    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'

# Apply sentiment
# NOTE(review): if user_reviews.csv already ships a 'Sentiment' column, this
# assignment overwrites it with the TextBlob-derived label; confirm intended.
data['Sentiment'] = data['Translated_Review'].apply(get_sentiment)

# Show sentiment counts
print(data['Sentiment'].value_counts())

# Single source of truth for label order, shared by the bar chart and the
# word-cloud loop below.
sentiments = ['Positive', 'Neutral', 'Negative']

# Plot sentiment distribution
plt.figure(figsize=(6,4))
sns.countplot(data=data, x='Sentiment', order=sentiments)
plt.title('Sentiment Distribution of Reviews')
plt.show()

# WordClouds for each sentiment
for sentiment in sentiments:
    class_reviews = data.loc[data['Sentiment'] == sentiment, 'Translated_Review']
    text = ' '.join(class_reviews.dropna())
    try:
        wordcloud = WordCloud(width=600, height=300, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when there is no word left to draw
        # (empty class, or only stop words). Skip that class instead of
        # aborting the remaining plots.
        print(f'No words to plot for {sentiment} reviews; skipping word cloud.')
        continue
    plt.figure(figsize=(8,4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for {sentiment} Reviews')
    plt.show()

# Prepare data for modeling (Binary classification: Positive vs Negative)
df_model = data[data['Sentiment'] != 'Neutral']
X = df_model['Translated_Review']
y = (df_model['Sentiment'] == 'Positive').astype(int)  # 1 = Positive, 0 = Negative

# Split train-test on the raw text BEFORE vectorizing. Fitting the vectorizer
# on the full corpus would let test-set words shape the vocabulary (data
# leakage) and make the reported scores optimistic. Stratify so both splits
# keep the same Positive/Negative ratio.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize text: learn the vocabulary from the training split only, then
# apply that same vocabulary to the held-out split.
cv = CountVectorizer(stop_words='english', max_features=5000)
X_train = cv.fit_transform(X_train_text)
X_test = cv.transform(X_test_text)
# Full matrix kept under its original name for any later code that uses it.
X_vec = cv.transform(X)

# Logistic Regression model
# The default max_iter=100 may not converge on thousands of sparse count
# features, so give the solver more room.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict & evaluate
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Done! You can save the model or explore further.
