# Midterm Progress

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Read both input tables: the historical movie data first, then the
# Smile 2 tweets. Both files are decoded as UTF-8.
movies_data_df, smile_2_tweets = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('./data/updated_movies_data.csv', './data/smile-2-tweets.csv')
)

# Reference attributes of the film being predicted.
# Budget is compared against the 'Budget' column further down, so it is
# presumably expressed in the same units as that column -- TODO confirm.
smile_2_budget = 28_000_000
# Genre labels used to measure how closely other movies resemble Smile 2.
smile_2_genres = {"Horror", "Thriller"}

# Sentiment Analysis for Smile 2 Tweets
def compute_sentiment(text):
    """Return the TextBlob polarity score for a piece of text.

    Args:
        text: The text to score. Cells that are empty in the CSV arrive here
            as float NaN rather than as strings.

    Returns:
        The ``sentiment.polarity`` value reported by TextBlob for ``text``,
        or NaN when ``text`` is not a string.
    """
    # TextBlob rejects non-string input with a TypeError. Returning NaN keeps
    # missing tweets from aborting the whole column and, unlike a 0.0
    # placeholder, lets pandas' mean() skip them instead of counting them as
    # neutral.
    if not isinstance(text, str):
        return float("nan")
    return TextBlob(text).sentiment.polarity

# Score every tweet and store the result next to its text.
smile_2_tweets['Sentiment'] = smile_2_tweets['Content'].apply(compute_sentiment)

# Preview: text and score of the first 100 tweets, one pair per tweet.
print("Sentiment for the first 100 tweets:")
preview = smile_2_tweets.head(100)
for tweet_text, tweet_score in zip(preview['Content'], preview['Sentiment']):
    print(f"Tweet: {tweet_text}")
    print(f"Sentiment: {tweet_score}")

# Mean score across all tweets; reused later as a model feature.
sentiment_scores = smile_2_tweets['Sentiment']
avg_sentiment_smile2 = sentiment_scores.mean()
print(f"\nAverage Sentiment for Smile 2 Tweets: {avg_sentiment_smile2}")

# Assign sentiment based on IMDb ratings
def rating_to_sentiment(rating):
    """Map a numeric rating onto a coarse sentiment score.

    Ratings of 7.5 and above map to 1.0, ratings from 5.5 up to (but not
    including) 7.5 map to 0.5, and anything lower maps to 0.2. A rating that
    compares false against every threshold (NaN, i.e. a missing rating)
    yields 0.0.
    """
    # Lower bounds are scanned from highest to lowest; the first one the
    # rating reaches decides the score.
    for lower_bound, score in ((7.5, 1.0), (5.5, 0.5)):
        if rating >= lower_bound:
            return score
    # Below every lower bound: either a genuinely low rating, or NaN (which
    # fails this comparison too and falls through to the 0.0 default).
    return 0.2 if rating < 5.5 else 0.0

# Convert each movie's 'Rating' into a sentiment score with
# rating_to_sentiment and keep it as a new column.
movie_ratings = movies_data_df['Rating']
movies_data_df['Derived_Sentiment'] = movie_ratings.apply(rating_to_sentiment)

# Calculate genre similarity
def genre_similarity(movie_genres, target_genres):
    """Return the fraction of ``target_genres`` that ``movie_genres`` contains.

    Args:
        movie_genres: Comma-separated genre names, e.g. ``"Horror, Thriller"``.
            Whitespace around each name is ignored. Any non-string value
            (such as NaN for a missing cell) is treated as "no genres".
        target_genres: Iterable of genre names to compare against. Duplicate
            names are counted once.

    Returns:
        A float between 0.0 and 1.0. The result is 0.0 when ``movie_genres``
        is not a string or when ``target_genres`` is empty. Matching is
        exact, so it is case-sensitive.
    """
    if not isinstance(movie_genres, str):
        return 0.0
    # De-duplicate so a repeated target name cannot inflate the denominator.
    targets = set(target_genres)
    if not targets:
        # Nothing to compare against; avoid dividing by zero.
        return 0.0
    listed = {name.strip() for name in movie_genres.split(',')}
    return len(listed & targets) / len(targets)

# Score every movie's genre overlap with Smile 2.
movies_data_df['Genre_Similarity'] = movies_data_df['Genre'].apply(
    lambda x: genre_similarity(x, smile_2_genres)
)

# Keep movies whose budget lies within +/-50% of Smile 2's budget and that
# share at least one genre with it.
budget_min, budget_max = smile_2_budget * 0.5, smile_2_budget * 1.5
# Convert the budget column once rather than once per comparison.
budget_values = movies_data_df['Budget'].astype(float)
# .copy() makes similar_movies an independent frame. Without it, adding
# columns to similar_movies later operates on a slice of movies_data_df and
# pandas emits SettingWithCopyWarning.
similar_movies = movies_data_df[
    (budget_values >= budget_min) &
    (budget_values <= budget_max) &
    (movies_data_df['Genre_Similarity'] > 0)
].copy()

# Display a few similar movies with their sentiment scores
print("\nSample of similar movies with their assigned sentiment scores:")
for i, row in similar_movies.head(10).iterrows():
    print(f"Title: {row['Title']}, Derived Sentiment: {row['Derived_Sentiment']}, Genre Similarity: {row['Genre_Similarity']}")

# Columns fed to the scaler and the model. Defined once so that training and
# the Smile 2 prediction row always use the same columns in the same order.
feature_columns = [
    'Budget',
    'Genre_Similarity',
    'Avg_Sentiment',
    'Derived_Sentiment',
    'Sentiment_Genre_Interaction',
]

# Add the tweet-level average sentiment and a sentiment-genre interaction
# term. assign() builds a new frame instead of writing into similar_movies in
# place, so this is safe even if similar_movies is a slice of another frame
# (no SettingWithCopyWarning).
# NOTE(review): Avg_Sentiment holds the same value on every row, so it has
# zero variance and cannot help the model tell movies apart.
similar_movies = similar_movies.assign(
    Avg_Sentiment=avg_sentiment_smile2,
    Sentiment_Genre_Interaction=(
        similar_movies['Derived_Sentiment'] * similar_movies['Genre_Similarity']
    ),
)

# Standardize the feature columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(similar_movies[feature_columns])

# Define features and target variable for prediction. The features keep the
# index of similar_movies so features and target stay aligned by label as
# well as by position.
features = pd.DataFrame(
    scaled_features, columns=feature_columns, index=similar_movies.index
)
# Strip '$' and ',' from the revenue strings before converting to float.
# Raw string: '\$' is not a valid escape sequence in a normal string literal.
target = similar_movies['Lifetime_Revenue'].replace(r'[\$,]', '', regex=True).astype(float)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize and train a Gradient Boosting Regressor model
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"\nR-Squared Score: {r2}")

# Predict Lifetime Box Office for Smile 2. The row is built as a DataFrame
# with the training column names because the scaler and the model were both
# fitted on named columns; passing bare lists/arrays makes scikit-learn warn
# about missing feature names and relies on positional order alone.
# NOTE(review): for Smile 2 the Derived_Sentiment and interaction values come
# from tweet polarity, whereas the training rows use rating-derived scores
# (0.0 / 0.2 / 0.5 / 1.0) -- confirm the two scales are meant to be comparable.
smile_2_raw = pd.DataFrame(
    [[smile_2_budget, 1, avg_sentiment_smile2, avg_sentiment_smile2, avg_sentiment_smile2]],
    columns=feature_columns,
)
smile_2_features = scaler.transform(smile_2_raw)
smile_2_predicted_box_office = model.predict(
    pd.DataFrame(smile_2_features, columns=feature_columns)
)
print(f"Predicted Lifetime Box Office Revenue for Smile 2: ${smile_2_predicted_box_office[0]:,.2f}")


Sentiment for the first 100 tweets:
Tweet: Smile 2 Is Even Better than the First!!! #Smile2#SmileMovie
Sentiment: 0.42942708333333335
Tweet: Once again I am afraid of smiles. Check out my review of #Smile2 and see why I think it's better than the first#smilemoviehttps://youtube.com/watch?v=cemAqlSj-_0…
Sentiment: -0.04999999999999999
Tweet: Is #Smile2 the new Final Destination franchise?
Sentiment: 0.06818181818181818
Tweet: #Smile2 feels like a prominent turning point for Naomi Scott, much like a smile creeping across one’s face. Our review: https://flickreel.com/smile-2-review/ 
Sentiment: 0.3333333333333333
Tweet: #Smile2 feels like a prominent turning point for Naomi Scott, much like a smile creeping across one’s face. My review: https://flickreel.com/smile-2-review/ 
Sentiment: 0.3333333333333333
Tweet: How yaw feel about smile2 #Smile2
Sentiment: 0.0
Tweet: #Smile2 was Go see this movie, way better than the first one #FrightNightWithFerociousFi #HorrorPodcast #SmileMovie
Sentimen

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_movies['Avg_Sentiment'] = avg_sentiment_smile2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_movies['Sentiment_Genre_Interaction'] = similar_movies['Derived_Sentiment'] * similar_movies['Genre_Similarity']
