### Project 1: Movie Rating Data Prep
#### Objective: Clean a movie rating dataset by handling missing ratings and ensuring movie titles are consistent.

In [1]:
import pandas as pd

##### Step 1: Create two sample DataFrames

In [2]:
# Sample ratings table. It is intentionally messy: titles differ in letter case
# and surrounding whitespace, and two ratings are missing (None is stored as NaN).
ratings_df = pd.DataFrame(
    dict(
        movie_title=[' Avatar ', 'Titanic', 'inception ', 'AVATAR', 'Titanic '],
        rating=[4.5, None, 5.0, 4.0, None],
        user_id=[101, 102, 103, 104, 105],
    )
)

# Reference table with one row per movie; its titles are already lowercase and trimmed.
movies_info = pd.DataFrame(
    dict(
        movie_title=['avatar', 'titanic', 'inception'],
        genre=['Sci-Fi', 'Romance', 'Thriller'],
        year=[2009, 1997, 2010],
    )
)

# Show the raw ratings before any cleaning is applied.
print("Original Ratings Data:", ratings_df, sep="\n")

Original Ratings Data:
  movie_title  rating  user_id
0     Avatar      4.5      101
1     Titanic     NaN      102
2  inception      5.0      103
3      AVATAR     4.0      104
4    Titanic      NaN      105


##### Step 2: Clean and Standardize Movie Titles

In [3]:
# Normalize titles so that variants such as ' Avatar ' and 'AVATAR' become the
# same value ('avatar').
ratings_df['movie_title'] = (
    ratings_df['movie_title']
    .str.lower()   # make every title lowercase
    .str.strip()   # remove leading and trailing whitespace
)

# Blank line first, then the heading, then the updated frame.
print("\nAfter Standardizing Movie Titles:", ratings_df, sep="\n")


After Standardizing Movie Titles:
  movie_title  rating  user_id
0      avatar     4.5      101
1     titanic     NaN      102
2   inception     5.0      103
3      avatar     4.0      104
4     titanic     NaN      105


##### Step 3: Handle Missing Ratings

In [4]:
# Impute the gaps in 'rating' with the mean of the ratings that are present
# (Series.mean skips NaN values by default).
avg_rating = ratings_df['rating'].mean()
ratings_df['rating'] = ratings_df['rating'].fillna(avg_rating)

# Blank line first, then the heading, then the imputed frame.
print("\nAfter Filling Missing Ratings with Average:", ratings_df, sep="\n")


After Filling Missing Ratings with Average:
  movie_title  rating  user_id
0      avatar     4.5      101
1     titanic     4.5      102
2   inception     5.0      103
3      avatar     4.0      104
4     titanic     4.5      105


##### Step 4: Merge with Movie Info Dataset

In [6]:
# Left-join movie metadata onto the cleaned ratings: every row of ratings_df (the
# left table) is kept, and genre/year are looked up in movies_info (the right
# table) by 'movie_title'. A title with no match in movies_info keeps its row,
# with genre/year left as NaN.
# validate='many_to_one' makes pandas raise a MergeError if movies_info ever holds
# a duplicated title, instead of silently duplicating rating rows.
merged_df = pd.merge(
    ratings_df,
    movies_info,
    on='movie_title',
    how='left',
    validate='many_to_one',
)

print("\nFinal Cleaned and Merged Movie Ratings Data:")
print(merged_df)


Final Cleaned and Merged Movie Ratings Data:
  movie_title  rating  user_id     genre  year
0      avatar     4.5      101    Sci-Fi  2009
1     titanic     4.5      102   Romance  1997
2   inception     5.0      103  Thriller  2010
3      avatar     4.0      104    Sci-Fi  2009
4     titanic     4.5      105   Romance  1997
