In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

# Load the data
df = pd.read_csv('merged_data.csv')

# Data cleaning function
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Clean review text
df['cleaned_review'] = df['review_title'].apply(clean_text)

# Sentiment analysis function using VADER
sid = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = sid.polarity_scores(text)
    return 'positive' if scores['compound'] >= 0.05 else ('negative' if scores['compound'] <= -0.05 else 'neutral')

# Apply sentiment analysis to get sentiment labels
df['sentiment'] = df['cleaned_review'].apply(get_sentiment)

# Encode sentiment labels
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

# Feature Engineering: Extract relevant features
df['review_length'] = df['cleaned_review'].apply(len)
df['word_count'] = df['cleaned_review'].apply(lambda x: len(x.split()))
df['helpful_vote'] = df['helpful_vote'].fillna(0)
df['verified_purchase'] = df['verified_purchase'].astype(int)

# Define the target variable for churn (e.g., leaving a negative review)
df['churn'] = df['sentiment'].apply(lambda x: 1 if x == 'negative' else 0)

# Prepare feature matrix and target vector
features = ['review_length', 'word_count', 'helpful_vote', 'verified_purchase', 'price']
X = df[features]
y = df['churn']

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'ROC AUC: {roc_auc}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

# Predict churn for all customers
df['churn_prediction'] = model.predict(X)
df['churn_probability'] = model.predict_proba(X)[:, 1]

# Save the dataset with churn predictions
df.to_csv('churn_predictions.csv', index=False)

# Display the first few rows to verify the results
df[['user_id', 'sentiment', 'churn', 'churn_prediction', 'churn_probability']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Accuracy: 0.5783358440975672
ROC AUC: 0.6090468002834579
Confusion Matrix:
[[82061 42902]
 [62342 62287]]
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.66      0.61    124963
           1       0.59      0.50      0.54    124629

    accuracy                           0.58    249592
   macro avg       0.58      0.58      0.58    249592
weighted avg       0.58      0.58      0.58    249592



Unnamed: 0,user_id,sentiment,churn,churn_prediction,churn_probability
0,AGTJAM6NFSYTR5KB6DQNHZ3ZIFFQ,positive,0,0,0.444783
1,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,positive,0,1,0.530193
2,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,neutral,0,0,0.447847
3,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,neutral,0,1,0.669571
4,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,positive,0,1,0.710663


In [4]:
display(df.columns)

Index(['rating', 'review_title', 'parent_asin', 'user_id', 'date',
       'helpful_vote', 'verified_purchase', 'main_category', 'item_title',
       'price', 'cleaned_review', 'sentiment', 'sentiment_encoded',
       'review_length', 'word_count', 'churn', 'churn_prediction',
       'churn_probability'],
      dtype='object')

In [5]:
#remove no longer needed columns
df = df.drop(columns=['rating', 'review_title','helpful_vote', 'verified_purchase', 'main_category', 'item_title',
       'price'])

# Save the dataset with churn predictions
df.to_csv('churn_predictions.csv', index=False)

# Display the first few rows to verify the results
df[['user_id', 'sentiment', 'churn', 'churn_prediction', 'churn_probability']].head()

Unnamed: 0,user_id,sentiment,churn,churn_prediction,churn_probability
0,AGTJAM6NFSYTR5KB6DQNHZ3ZIFFQ,positive,0,0,0.444783
1,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,positive,0,1,0.530193
2,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,neutral,0,0,0.447847
3,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,neutral,0,1,0.669571
4,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,positive,0,1,0.710663


In [1]:
display(df.columns)

NameError: name 'df' is not defined