In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
try:
    movies_df = pd.read_csv('theatre_cleaned.csv')
    display(movies_df.head())
    print(movies_df.shape)
except FileNotFoundError:
    print("Error: 'theatre_cleaned.csv' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Unnamed: 0,city,theatre_name,average_ticket_price,total_seats,no_screens,type,lat,lon,source_of_information
0,Ahmedabad,"AB Miniplex: Shivranjini Cross Road, Satellite",125.619048,302,3,1,23.02494,72.52938,BookMyShow
1,Ahmedabad,Amber Cinema: Ahmedabad,100.833333,763,1,0,23.03431,72.62002,BookMyShow
2,Ahmedabad,Anupam Cinema: Ahmedabad,125.833333,781,1,0,23.00656,72.61333,BookMyShow
3,Ahmedabad,"Apsara Cinema, Behrampura",149.094915,1117,1,0,23.003974,72.597468,MediaAnt
4,Ahmedabad,"Aradhana Cinema, Behrampura",149.094915,455,1,0,23.00396,72.59752,MediaAnt


(578, 9)


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Handle missing values (example: fill with mean)
for col in movies_df.select_dtypes(include=['number']):
    movies_df[col] = movies_df[col].fillna(movies_df[col].mean())

# Handle categorical missing values (example: fill with mode)
for col in movies_df.select_dtypes(include=['object']):
    movies_df[col] = movies_df[col].fillna(movies_df[col].mode()[0])


# Example: Create a 'profit_margin' feature
movies_df['profit_margin'] = movies_df['average_ticket_price'] * movies_df['total_seats']  # This is a simple example, you might need a more complex calculation


In [9]:
le = LabelEncoder()
for col in ['city', 'type', 'source_of_information']:  # Add other categorical columns
    movies_df[col] = le.fit_transform(movies_df[col])

In [10]:
# Define features (X) and target (y)
X = movies_df[['city', 'type', 'average_ticket_price', 'profit_margin']]
y = movies_df['total_seats']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Create a Random Forest Regressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust hyperparameters

In [12]:
# Train the model
rf_model.fit(X_train, y_train)


In [14]:
# predict values
y_pred=rf_model.predict(X_test)

In [15]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")



Mean Squared Error: 8053.135663793103
R-squared: 0.9488541738265875


In [16]:
import pickle

# Assuming 'rf_model' is your trained Random Forest model
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)