In [1]:
# Movie Rating Prediction
### Data Science Project

In [None]:
## 1. Import libraries

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
## 2. Load dataset

In [20]:
data = pd.read_csv("movie.csv", encoding="Latin1")

In [None]:
## 3. Fill missing values

In [21]:
# Numeric
data['Rating'] = pd.to_numeric(data['Rating'], errors='coerce')  # convert invalid to NaN
data['Votes'] = pd.to_numeric(data['Votes'], errors='coerce')

# Fill NaN with mean
data['Rating'] = data['Rating'].fillna(data['Rating'].mean())
data['Votes'] = data['Votes'].fillna(data['Votes'].mean())

# Categorical (for completeness)
data['Genre'] = data['Genre'].fillna("Unknown")
data['Director'] = data['Director'].fillna("Unknown")
data['Actor 1'] = data['Actor 1'].fillna("Unknown")
data['Actor 2'] = data['Actor 2'].fillna("Unknown")
data['Actor 3'] = data['Actor 3'].fillna("Unknown")

In [None]:
## 4. Clean numeric columns

In [22]:
# Year
data['Year'] = data['Year'].str.replace('(', '', regex=False).str.replace(')', '', regex=False)
data['Year'] = pd.to_numeric(data['Year'], errors='coerce').fillna(0)

# Duration
data['Duration'] = data['Duration'].str.replace(' min', '', regex=False)
data['Duration'] = pd.to_numeric(data['Duration'], errors='coerce').fillna(0)


In [27]:
## 5. Prepare features and target

In [23]:
X = data[['Year', 'Duration', 'Votes']]  # numeric features only
y = data['Rating']                        # target

# Force numeric type (prevents TypeError)
X = X.apply(pd.to_numeric)
y = pd.to_numeric(y)

In [None]:
## 6. Train-test split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
## 7. Train model

In [25]:
model = LinearRegression()
model.fit(X_train, y_train)
print("Model trained successfully ✅")

Model trained successfully ✅


In [None]:
## 8. Predict & evaluate

In [26]:
y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

# Optional: show sample predictions
result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(result.head(10))

Mean Squared Error: 0.9621086569123414
R2 Score: 0.0003830911213361654
         Actual  Predicted
11115  5.841621   5.852684
2921   6.800000   5.935251
3463   5.841621   5.782993
2495   5.841621   5.782705
15263  5.841621   5.780695
11165  4.600000   5.929981
8371   4.800000   5.931084
6929   5.300000   5.806570
4246   6.900000   5.980452
3312   5.841621   5.782705
