# Import Libraries

In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Cleaning

## 1. Read Data

In [98]:
df = pd.read_csv('unclean_data.csv', encoding='latin1')
df.head(3)

Unnamed: 0,movie_title,num_critic_for_reviews,duration,DIRECTOR_facebook_likes,actor_3_facebook_likes,ACTOR_1_facebook_likes,gross,num_voted_users,Cast_Total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,ACTOR_2_facebook_likes,imdb_score,title_year.1
0,Avatar?ÿ,723,178.0,10,855,1000,760505847,886204.0,4834.0,,3054,237000000,2009,936.0,7.9,2009.0
1,Pirates of the Caribbean: At World's End?ÿ,302,,563,1000,40000,309404152,471220.0,48350.0,,1238,300000000,2007,5000.0,7.1,
2,Spectre?ÿ,602,148.0,20,161,11000,200074175,275868.0,11700.0,1.0,994,245000000,2015,393.0,6.8,2015.0


## 2. Understanding the Data 

In [99]:
df.describe()

Unnamed: 0,num_critic_for_reviews,duration,actor_3_facebook_likes,ACTOR_1_facebook_likes,gross,num_voted_users,Cast_Total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,ACTOR_2_facebook_likes,imdb_score,title_year.1
count,14.0,11.0,14.0,14.0,14.0,13.0,12.0,9.0,14.0,14.0,14.0,13.0,14.0,7.0
mean,506.142857,150.727273,5866.142857,18206.428571,333583600.0,462157.8,44773.583333,2.111111,1620.071429,246264300.0,2010.5,8455.846154,7.171429,2011.142857
std,169.069789,21.679903,8289.592695,13905.214361,172067400.0,268705.9,37290.813335,1.269296,866.672102,24268230.0,3.632122,8541.019541,0.673028,3.976119
min,302.0,106.0,161.0,451.0,73058680.0,212204.0,1873.0,1.0,387.0,200000000.0,2006.0,393.0,6.1,2007.0
25%,379.25,141.0,611.25,3500.0,200257400.0,294810.0,9983.5,1.0,1024.75,239000000.0,2007.25,632.0,6.725,2008.0
50%,448.0,151.0,1000.0,21000.0,319826600.0,383056.0,47202.5,2.0,1240.5,250000000.0,2009.5,5000.0,7.2,2009.0
75%,635.0,162.5,8500.0,26000.0,441856100.0,471220.0,67064.75,3.0,2250.75,256000000.0,2014.25,11000.0,7.5,2015.0
max,813.0,183.0,23000.0,40000.0,760505800.0,1144337.0,106759.0,4.0,3054.0,300000000.0,2016.0,23000.0,8.5,2016.0


In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   movie_title                14 non-null     object 
 1   num_critic_for_reviews     14 non-null     int64  
 2   duration                   11 non-null     float64
 3   DIRECTOR_facebook_likes    12 non-null     object 
 4   actor_3_facebook_likes     14 non-null     int64  
 5   ACTOR_1_facebook_likes     14 non-null     int64  
 6   gross                      14 non-null     int64  
 7   num_voted_users            13 non-null     float64
 8   Cast_Total_facebook_likes  12 non-null     float64
 9   facenumber_in_poster       9 non-null      float64
 10  num_user_for_reviews       14 non-null     int64  
 11  budget                     14 non-null     int64  
 12  title_year                 14 non-null     int64  
 13  ACTOR_2_facebook_likes     13 non-null     float64
 

In [101]:
df.isnull().sum().sort_values(ascending=False)

title_year.1                 7
facenumber_in_poster         5
duration                     3
DIRECTOR_facebook_likes      2
Cast_Total_facebook_likes    2
num_voted_users              1
ACTOR_2_facebook_likes       1
movie_title                  0
num_critic_for_reviews       0
actor_3_facebook_likes       0
ACTOR_1_facebook_likes       0
gross                        0
num_user_for_reviews         0
budget                       0
title_year                   0
imdb_score                   0
dtype: int64

## 3. Remove Duplicates

In [102]:
duplicates = df[df.duplicated()]
duplicates

Unnamed: 0,movie_title,num_critic_for_reviews,duration,DIRECTOR_facebook_likes,actor_3_facebook_likes,ACTOR_1_facebook_likes,gross,num_voted_users,Cast_Total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,ACTOR_2_facebook_likes,imdb_score,title_year.1


## 4. Handle Missing Values

In [103]:
def percentage_missing_values(data_frame: pd.DataFrame, columns=False):
    if columns:
        if isinstance(columns, str):
            print((data_frame[columns].isnull().sum()*100/len(data_frame)).round(2))
        if isinstance(columns, list):
            print((data_frame[columns].isnull().sum()*100/len(data_frame)).round(2).sort_values(ascending=False))
    else:
        print((data_frame.isnull().sum()*100/len(data_frame)).round(2).sort_values(ascending=False))

In [104]:
def correlation(data_frame: pd.DataFrame, column=False):
    if column:
        if isinstance(column, str):
            print(data_frame[[x for x in data_frame.columns if data_frame[x].dtype != 'object']].corr()[column])
    else:
        print(data_frame[[x for x in data_frame.columns if data_frame[x].dtype != 'object']].corr())

In [105]:
percentage_missing_values(df)

title_year.1                 50.00
facenumber_in_poster         35.71
duration                     21.43
DIRECTOR_facebook_likes      14.29
Cast_Total_facebook_likes    14.29
num_voted_users               7.14
ACTOR_2_facebook_likes        7.14
movie_title                   0.00
num_critic_for_reviews        0.00
actor_3_facebook_likes        0.00
ACTOR_1_facebook_likes        0.00
gross                         0.00
num_user_for_reviews          0.00
budget                        0.00
title_year                    0.00
imdb_score                    0.00
dtype: float64


In [106]:
cleaned_missing_data_df = df.copy()

### Title_Years.1 (50 %)

During the data analysis process, we identified a high correlation of 0.986321 between the columns title_year and title_year.1. This strong correlation indicates that the values in title_year.1 are highly dependent on title_year, making it a suitable candidate for filling in missing values.

To ensure data completeness and maintain consistency, we decided to replace the missing values in title_year.1 with the corresponding values from title_year. This approach helps preserve the integrity of the dataset while minimizing the impact of missing data on further analysis.

By applying this method, we ensure that our dataset remains as accurate and reliable as possible for subsequent modeling and decision-making processes.

In [107]:
cleaned_missing_data_df[['title_year.1', 'title_year']]

Unnamed: 0,title_year.1,title_year
0,2009.0,2009
1,,2007
2,2015.0,2015
3,,2012
4,,2012
5,2007.0,2007
6,,2010
7,,2015
8,2015.0,2015
9,,2009


In [108]:
correlation(cleaned_missing_data_df, 'title_year.1')

num_critic_for_reviews       0.680129
duration                     0.276204
actor_3_facebook_likes       0.376287
ACTOR_1_facebook_likes      -0.038468
gross                       -0.156177
num_voted_users             -0.249049
Cast_Total_facebook_likes    0.207619
facenumber_in_poster         0.408248
num_user_for_reviews        -0.062648
budget                       0.447124
title_year                   0.986321
ACTOR_2_facebook_likes       0.326709
imdb_score                   0.152546
title_year.1                 1.000000
Name: title_year.1, dtype: float64


In [109]:
cleaned_missing_data_df['title_year.1'] = cleaned_missing_data_df['title_year.1'].fillna(
    cleaned_missing_data_df['title_year']
).astype(int)

In [110]:
cleaned_missing_data_df['title_year.1']

0     2009
1     2007
2     2015
3     2012
4     2012
5     2007
6     2010
7     2015
8     2015
9     2009
10    2016
11    2006
12    2008
13    2008
Name: title_year.1, dtype: int32

### ACTOR_2_facebook_likes

During data preprocessing, we identified one missing value in the ACTOR_2_facebook_likes column. Given that this is a single missing entry, we determined that removing the corresponding row would have a negligible impact on further analysis.

Eliminating this row ensures data consistency while maintaining the overall quality of the dataset. Since the missing value represents an insignificant portion of the data, this decision minimizes potential biases and ensures a cleaner dataset for accurate insights and modeling.

In [111]:
cleaned_missing_data_df['ACTOR_2_facebook_likes']

0       936.0
1      5000.0
2       393.0
3     23000.0
4       632.0
5     11000.0
6       553.0
7     21000.0
8     21000.0
9     11000.0
10        NaN
11    10000.0
12      412.0
13     5000.0
Name: ACTOR_2_facebook_likes, dtype: float64

In [112]:
cleaned_missing_data_df = cleaned_missing_data_df.dropna(subset=['ACTOR_2_facebook_likes'])
cleaned_missing_data_df['ACTOR_2_facebook_likes'] = cleaned_missing_data_df['ACTOR_2_facebook_likes'].astype(int)
cleaned_missing_data_df['ACTOR_2_facebook_likes']

0       936
1      5000
2       393
3     23000
4       632
5     11000
6       553
7     21000
8     21000
9     11000
11    10000
12      412
13     5000
Name: ACTOR_2_facebook_likes, dtype: int32

### Cast_Total_facebook_likes

Our analysis revealed a strong correlation between Actor_3_Facebook_Likes (0.963516) and Cast_Total_facebook_likes, as well as between Actor_2_Facebook_Likes (0.913854) and Cast_Total_facebook_likes. Given these high correlation values, we decided to use linear regression to fill in the missing values for Actor_3_Facebook_Likes and Actor_2_Facebook_Likes.

By leveraging this approach, we can estimate the missing values based on the relationship between individual actor likes and the total cast likes, ensuring a more accurate and statistically sound dataset. This method helps retain valuable information and avoids unnecessary data loss, improving the overall quality of our analysis.

In [113]:
percentage_missing_values(cleaned_missing_data_df, 'Cast_Total_facebook_likes')

15.38


In [114]:
correlation(cleaned_missing_data_df, 'Cast_Total_facebook_likes')

num_critic_for_reviews       0.328225
duration                     0.101756
actor_3_facebook_likes       0.911231
ACTOR_1_facebook_likes       0.696016
gross                        0.334434
num_voted_users              0.422842
Cast_Total_facebook_likes    1.000000
facenumber_in_poster         0.985620
num_user_for_reviews         0.101120
budget                       0.250497
title_year                   0.312125
ACTOR_2_facebook_likes       0.963516
imdb_score                   0.572906
title_year.1                 0.335215
Name: Cast_Total_facebook_likes, dtype: float64


In [115]:
from sklearn.linear_model import LinearRegression

In [116]:
known = cleaned_missing_data_df[cleaned_missing_data_df['Cast_Total_facebook_likes'].notna()]
unknown = cleaned_missing_data_df[cleaned_missing_data_df['Cast_Total_facebook_likes'].isna()]

X_train = known[['actor_3_facebook_likes', 'ACTOR_2_facebook_likes']]
Y_train = known['Cast_Total_facebook_likes'].astype(int)

model = LinearRegression()
model.fit(X_train, Y_train)

X_test = unknown[['actor_3_facebook_likes', 'ACTOR_2_facebook_likes']]
predicted = model.predict(X_test)

predicted = np.round(predicted).astype(int)
predicted = np.clip(predicted, 0, None)

cleaned_missing_data_df.loc[unknown.index, 'Cast_Total_facebook_likes'] = predicted

In [117]:
cleaned_missing_data_df['Cast_Total_facebook_likes'] = cleaned_missing_data_df['Cast_Total_facebook_likes'].astype(int)
cleaned_missing_data_df['Cast_Total_facebook_likes']

0       4834
1      48350
2      11700
3     106759
4       1873
5      46055
6       9720
7      92000
8      92000
9      58753
11     60885
12      2023
13     48486
Name: Cast_Total_facebook_likes, dtype: int32

### DIRECTOR_facebook_likes

During data preprocessing, we identified several issues in the DIRECTOR_facebook_likes column that required cleaning to ensure consistency and accuracy:

1. String Representation of a Number:

The value "475" was enclosed in quotation marks, indicating an incorrect data type.
We converted it from a string to a numerical value (475) to maintain uniformity in the dataset.

2. Missing Value Handling:
        
We detected one missing value (NaN) in the column.
Given that it was a single missing entry, we decided to remove the corresponding row to preserve data integrity without significantly affecting the analysis.

3. Data Type Conversion:

After addressing the inconsistencies, we converted the entire column to integer type (int) to ensure numerical consistency for further analysis.

These steps help maintain data accuracy, prevent potential errors in calculations, and improve the reliability of insights derived from our dataset.

In [118]:
cleaned_missing_data_df['DIRECTOR_facebook_likes']

0        10
1       563
2        20
3     22000
4     "475"
5        23
6        15
7        10
8        10
9       282
11      NaN
12      395
13      563
Name: DIRECTOR_facebook_likes, dtype: object

In [119]:
cleaned_missing_data_df['DIRECTOR_facebook_likes'] = cleaned_missing_data_df['DIRECTOR_facebook_likes'].astype(str).str.replace('"', '', regex=False)

In [120]:
cleaned_missing_data_df['DIRECTOR_facebook_likes'] = cleaned_missing_data_df['DIRECTOR_facebook_likes'].replace('nan', np.nan)

In [121]:
cleaned_missing_data_df = cleaned_missing_data_df.dropna(subset=['DIRECTOR_facebook_likes'])

In [122]:
cleaned_missing_data_df['DIRECTOR_facebook_likes'] = cleaned_missing_data_df['DIRECTOR_facebook_likes'].astype(int)

In [123]:
cleaned_missing_data_df['DIRECTOR_facebook_likes']

0        10
1       563
2        20
3     22000
4       475
5        23
6        15
7        10
8        10
9       282
12      395
13      563
Name: DIRECTOR_facebook_likes, dtype: int32

### facenumber_in_poster

Filling Missing Values in facenumber_in_poster Using Linear Regression

Our analysis identified strong correlations between facenumber_in_poster and several other variables:

* Actor_2_Facebook_Likes (0.990246)
* Cast_Total_Facebook_Likes (0.970131)
* Actor_3_Facebook_Likes (0.986836)

Given these high correlation values, we decided to use linear regression to estimate and fill in the missing values for facenumber_in_poster. This method allows us to predict the missing entries based on their strong relationship with other features in the dataset, ensuring consistency and preserving valuable information.

By applying this approach, we improve data completeness while maintaining the statistical integrity of our dataset for further analysis and modeling.

In [124]:
cleaned_missing_data_df['facenumber_in_poster']

0     NaN
1     NaN
2     1.0
3     NaN
4     1.0
5     NaN
6     1.0
7     4.0
8     4.0
9     3.0
12    1.0
13    2.0
Name: facenumber_in_poster, dtype: float64

In [125]:
percentage_missing_values(cleaned_missing_data_df, 'facenumber_in_poster')

33.33


In [126]:
correlation(cleaned_missing_data_df, 'facenumber_in_poster')

num_critic_for_reviews       0.503866
duration                     0.398199
DIRECTOR_facebook_likes     -0.360133
actor_3_facebook_likes       0.970131
ACTOR_1_facebook_likes       0.697967
gross                        0.862914
num_voted_users              0.676375
Cast_Total_facebook_likes    0.986836
facenumber_in_poster         1.000000
num_user_for_reviews         0.261351
budget                       0.193735
title_year                   0.379868
ACTOR_2_facebook_likes       0.990246
imdb_score                   0.560600
title_year.1                 0.416378
Name: facenumber_in_poster, dtype: float64


In [127]:
known = cleaned_missing_data_df[cleaned_missing_data_df['facenumber_in_poster'].notna()]
unknown = cleaned_missing_data_df[cleaned_missing_data_df['facenumber_in_poster'].isna()]

X_train = known[['ACTOR_2_facebook_likes', 'actor_3_facebook_likes', 'Cast_Total_facebook_likes']]
Y_train = known['facenumber_in_poster'].astype(int)

model = LinearRegression()
model.fit(X_train, Y_train)

X_test = unknown[['ACTOR_2_facebook_likes', 'actor_3_facebook_likes', 'Cast_Total_facebook_likes']]
predicted = model.predict(X_test)

predicted = np.round(predicted).astype(int)
predicted = np.clip(predicted, 0, None)

cleaned_missing_data_df.loc[unknown.index, 'facenumber_in_poster'] = predicted

In [128]:
cleaned_missing_data_df['facenumber_in_poster'] = cleaned_missing_data_df['facenumber_in_poster'].astype(int)
cleaned_missing_data_df['facenumber_in_poster']

0     1
1     2
2     1
3     5
4     1
5     2
6     1
7     4
8     4
9     3
12    1
13    2
Name: facenumber_in_poster, dtype: int32

### duration

During our analysis, we observed that duration does not have a strong correlation with other columns, making it unsuitable for prediction using regression techniques. Additionally, we identified three missing values in this column.

To handle these missing values, we decided to fill them using the mean duration value. This method ensures that the overall distribution of the data remains unaffected while maintaining consistency across the dataset.

By using the mean, we effectively retain all records without introducing significant bias, ensuring a balanced approach for further analysis and modeling.

In [129]:
cleaned_missing_data_df['duration']

0     178.0
1       NaN
2     148.0
3       NaN
4     132.0
5     156.0
6       NaN
7     141.0
8     141.0
9     153.0
12    106.0
13    151.0
Name: duration, dtype: float64

In [130]:
correlation(cleaned_missing_data_df, 'duration')

num_critic_for_reviews       0.332430
duration                     1.000000
DIRECTOR_facebook_likes     -0.433861
actor_3_facebook_likes      -0.017452
ACTOR_1_facebook_likes       0.232577
gross                        0.717757
num_voted_users              0.654178
Cast_Total_facebook_likes    0.101756
facenumber_in_poster         0.039848
num_user_for_reviews         0.669421
budget                       0.421333
title_year                  -0.126230
ACTOR_2_facebook_likes       0.053141
imdb_score                   0.448822
title_year.1                -0.113918
Name: duration, dtype: float64


In [131]:
cleaned_missing_data_df['duration'].fillna(cleaned_missing_data_df['duration'].mean(), inplace=True)
cleaned_missing_data_df['duration'] = cleaned_missing_data_df['duration'].astype(int)
cleaned_missing_data_df['duration']

0     178
1     145
2     148
3     145
4     132
5     156
6     145
7     141
8     141
9     153
12    106
13    151
Name: duration, dtype: int32

In [132]:
percentage_missing_values(cleaned_missing_data_df)

movie_title                  0.0
num_critic_for_reviews       0.0
duration                     0.0
DIRECTOR_facebook_likes      0.0
actor_3_facebook_likes       0.0
ACTOR_1_facebook_likes       0.0
gross                        0.0
num_voted_users              0.0
Cast_Total_facebook_likes    0.0
facenumber_in_poster         0.0
num_user_for_reviews         0.0
budget                       0.0
title_year                   0.0
ACTOR_2_facebook_likes       0.0
imdb_score                   0.0
title_year.1                 0.0
dtype: float64


## 5. Standardize Data Formats

In [133]:
standardize_data_df = cleaned_missing_data_df.copy()

In [134]:
standardize_data_df.head(3)

Unnamed: 0,movie_title,num_critic_for_reviews,duration,DIRECTOR_facebook_likes,actor_3_facebook_likes,ACTOR_1_facebook_likes,gross,num_voted_users,Cast_Total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,ACTOR_2_facebook_likes,imdb_score,title_year.1
0,Avatar?ÿ,723,178,10,855,1000,760505847,886204.0,4834,1,3054,237000000,2009,936,7.9,2009
1,Pirates of the Caribbean: At World's End?ÿ,302,145,563,1000,40000,309404152,471220.0,48350,2,1238,300000000,2007,5000,7.1,2007
2,Spectre?ÿ,602,148,20,161,11000,200074175,275868.0,11700,1,994,245000000,2015,393,6.8,2015


In [135]:
standardize_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12 entries, 0 to 13
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   movie_title                12 non-null     object 
 1   num_critic_for_reviews     12 non-null     int64  
 2   duration                   12 non-null     int32  
 3   DIRECTOR_facebook_likes    12 non-null     int32  
 4   actor_3_facebook_likes     12 non-null     int64  
 5   ACTOR_1_facebook_likes     12 non-null     int64  
 6   gross                      12 non-null     int64  
 7   num_voted_users            12 non-null     float64
 8   Cast_Total_facebook_likes  12 non-null     int32  
 9   facenumber_in_poster       12 non-null     int32  
 10  num_user_for_reviews       12 non-null     int64  
 11  budget                     12 non-null     int64  
 12  title_year                 12 non-null     int64  
 13  ACTOR_2_facebook_likes     12 non-null     int32  
 14  i

### num_voted_users

During data preprocessing, we identified that the num_voted_users column is stored as a floating-point (float) type, even though it represents a count-based variable. Since vote counts should always be whole numbers, we decided to convert the column to an integer (int) type.

This conversion ensures consistency in data representation, optimizes storage, and prevents potential issues in calculations or visualizations. By maintaining the correct data type, we improve the accuracy and reliability of our dataset for further analysis.

In [136]:
standardize_data_df['num_voted_users'] = standardize_data_df['num_voted_users'].astype(int)
standardize_data_df['num_voted_users']

0      886204
1      471220
2      275868
3     1144337
4      212204
5      383056
6      294810
7      462669
8      462669
9      321795
12     330784
13     522040
Name: num_voted_users, dtype: int32

### movie_title

During data preprocessing, we identified that all entries in the movie_title column contain two unwanted symbols (?ÿ) at the end. These characters are likely encoding artifacts and do not belong to the actual movie titles.

To ensure data consistency and improve readability, we decided to remove the last two symbols from each movie title. This step helps maintain clean and properly formatted text, preventing potential issues in further analysis, visualizations, or machine learning models.

By applying this cleaning process, we ensure that the movie_title column accurately represents the correct movie names without extraneous characters.

In [137]:
standardize_data_df['movie_title']

0                                         Avatar?ÿ
1       Pirates of the Caribbean: At World's End?ÿ
2                                        Spectre?ÿ
3                          The Dark Knight Rises?ÿ
4                                    John Carter?ÿ
5                                   Spider-Man 3?ÿ
6                                        Tangled?ÿ
7                        Avengers: Age of Ultron?ÿ
8                        Avengers: Age of Ultron?ÿ
9         Harry Potter and the Half-Blood Prince?ÿ
12                             Quantum of Solace?ÿ
13    Pirates of the Caribbean: Dead Man's Chest?ÿ
Name: movie_title, dtype: object

In [138]:
standardize_data_df['movie_title'] = standardize_data_df['movie_title'].str[:-2]
standardize_data_df['movie_title']

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
5                                   Spider-Man 3
6                                        Tangled
7                        Avengers: Age of Ultron
8                        Avengers: Age of Ultron
9         Harry Potter and the Half-Blood Prince
12                             Quantum of Solace
13    Pirates of the Caribbean: Dead Man's Chest
Name: movie_title, dtype: object

## 6. Fix Structural Errors

### Column names

In [139]:
column_name_change_df = standardize_data_df.copy()

In [140]:
column_name_change_df.columns

Index(['movie_title', 'num_critic_for_reviews', 'duration',
       'DIRECTOR_facebook_likes', 'actor_3_facebook_likes',
       'ACTOR_1_facebook_likes', 'gross', 'num_voted_users',
       'Cast_Total_facebook_likes', 'facenumber_in_poster',
       'num_user_for_reviews', 'budget', 'title_year',
       'ACTOR_2_facebook_likes', 'imdb_score', 'title_year.1'],
      dtype='object')

In [141]:
column_name_change_df = column_name_change_df.rename(
    columns= {
        'movie_title': 'title',
        'num_critic_for_reviews': 'critic_reviews',
        'duration': 'movie_duration',
        'DIRECTOR_facebook_likes': 'director_fb_likes',
        'actor_3_facebook_likes': 'actor3_fb_likes',
        'ACTOR_2_facebook_likes': 'actor2_fb_likes',
        'ACTOR_1_facebook_likes': 'actor1_fb_likes',
        'num_voted_users': 'voted_users',
        'Cast_Total_facebook_likes': 'cast_fb_likes',
        'facenumber_in_poster': 'faces_on_poster',
        'num_user_for_reviews': 'user_reviews',
        'budget': 'movie_budget',
        'title_year.1': 'corrected_title_year'
    }
)

In [142]:
column_name_change_df.head(1)

Unnamed: 0,title,critic_reviews,movie_duration,director_fb_likes,actor3_fb_likes,actor1_fb_likes,gross,voted_users,cast_fb_likes,faces_on_poster,user_reviews,movie_budget,title_year,actor2_fb_likes,imdb_score,corrected_title_year
0,Avatar,723,178,10,855,1000,760505847,886204,4834,1,3054,237000000,2009,936,7.9,2009


In [143]:
cleaned_df = column_name_change_df[[
    'title', 
    'title_year', 
    'corrected_title_year',
    'movie_duration',
    'faces_on_poster',
    'critic_reviews',
    'user_reviews',
    'imdb_score',
    'movie_budget',
    'gross',
    'voted_users',
    'cast_fb_likes',
    'director_fb_likes',
    'actor1_fb_likes',
    'actor2_fb_likes',
    'actor3_fb_likes'
    ]]

In [146]:
cleaned_df.reset_index(drop=True, inplace=True)

In [147]:
cleaned_df

Unnamed: 0,title,title_year,corrected_title_year,movie_duration,faces_on_poster,critic_reviews,user_reviews,imdb_score,movie_budget,gross,voted_users,cast_fb_likes,director_fb_likes,actor1_fb_likes,actor2_fb_likes,actor3_fb_likes
0,Avatar,2009,2009,178,1,723,3054,7.9,237000000,760505847,886204,4834,10,1000,936,855
1,Pirates of the Caribbean: At World's End,2007,2007,145,2,302,1238,7.1,300000000,309404152,471220,48350,563,40000,5000,1000
2,Spectre,2015,2015,148,1,602,994,6.8,245000000,200074175,275868,11700,20,11000,393,161
3,The Dark Knight Rises,2012,2012,145,5,813,2701,8.5,250000000,448130642,1144337,106759,22000,27000,23000,23000
4,John Carter,2012,2012,132,1,462,738,6.6,263700000,73058679,212204,1873,475,640,632,530
5,Spider-Man 3,2007,2007,156,2,392,1902,6.2,258000000,336530303,383056,46055,23,24000,11000,4000
6,Tangled,2010,2010,145,1,324,387,7.8,260000000,200807262,294810,9720,15,799,553,284
7,Avengers: Age of Ultron,2015,2015,141,4,635,1117,7.5,250000000,458991599,462669,92000,10,26000,21000,19000
8,Avengers: Age of Ultron,2015,2015,141,4,635,1117,7.5,250000000,458991599,462669,92000,10,26000,21000,19000
9,Harry Potter and the Half-Blood Prince,2009,2009,153,3,375,973,7.5,250000000,301956980,321795,58753,282,25000,11000,10000


✅ Improved Clarity – Column names are now more intuitive and easier to understand.

✅ Better Organization – Grouped similar attributes together for better readability.

✅ Easier Analysis – Simplifies working with the dataset in visualizations and modeling.

These adjustments make the dataset more user-friendly and ensure smoother analysis for further insights.

## 7. Outliers

After running the outlier detection function with n=2, the function returned an empty list, meaning that there were no multiple outliers identified in the dataset.

In [148]:
from collections import Counter

def delete_outliers(data_frame: pd.DataFrame, n: int, features: list) -> list:
    """
    Detect and Identifies outliers in the specified features of a 
    DataFrame.

    This function iterates through a list of numerical features and detects
    outliers in each feature based on the IQR method. An outlier is defined
    as a value below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
    If an index appears as an outlier in more than N features, it is added to the 
    final list of multiple outliers.

    Parameters:
    ---------------
    data_frame: pd.DataFrame
        The DataFrame containing the data to analyze.

    n: int
        The minimum number of features in which a data point must be an outlier
        to be considered a multiple outlier.

    features: list
        A list of column names (features) to analyze for outliers.
        Those should be numerical columns.

    Returns:
    -------------
    list
        A list of indices corresponding to data points that are outliers
        in more than n features.

    Raises:
    ------------
    TypeError
        If data_frame is not a pandas DataFrame, or if features is not a list.

    ValueError
        If n is negative, or if a feature in the features list is not in
        the DataFrame.
    """
    if not isinstance(data_frame, pd.DataFrame):
        raise TypeError("`data_frame` must be a pandas DataFrame.")
    if not isinstance(features, list):
        raise TypeError("`features` must be a list of column names.")
    if not isinstance(n, int) or n < 0:
        raise ValueError("`n` must be a non-negative integer.")
        
    outliers = []
    for feature in features:
        Q1 = np.percentile(data_frame[feature], 25)
        Q3 = np.percentile(data_frame[feature], 75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5*IQR
        upper = Q3 + 1.5*IQR

        outlier_list = data_frame[(data_frame[feature] < lower) | (data_frame[feature] > upper)].index
        outliers.extend(outlier_list)
    outliers = Counter(outliers)
    multiple_outliers = list(key for key, value in outliers.items() if value > n)
    return multiple_outliers

In [149]:
outliers = delete_outliers(cleaned_df, 2, [x for x in cleaned_df.columns if cleaned_df[x].dtype != 'object'])

In [150]:
print(f'Outliers: {outliers}, {len(outliers)} numbers.')

Outliers: [], 0 numbers.


In [151]:
cleaned_df

Unnamed: 0,title,title_year,corrected_title_year,movie_duration,faces_on_poster,critic_reviews,user_reviews,imdb_score,movie_budget,gross,voted_users,cast_fb_likes,director_fb_likes,actor1_fb_likes,actor2_fb_likes,actor3_fb_likes
0,Avatar,2009,2009,178,1,723,3054,7.9,237000000,760505847,886204,4834,10,1000,936,855
1,Pirates of the Caribbean: At World's End,2007,2007,145,2,302,1238,7.1,300000000,309404152,471220,48350,563,40000,5000,1000
2,Spectre,2015,2015,148,1,602,994,6.8,245000000,200074175,275868,11700,20,11000,393,161
3,The Dark Knight Rises,2012,2012,145,5,813,2701,8.5,250000000,448130642,1144337,106759,22000,27000,23000,23000
4,John Carter,2012,2012,132,1,462,738,6.6,263700000,73058679,212204,1873,475,640,632,530
5,Spider-Man 3,2007,2007,156,2,392,1902,6.2,258000000,336530303,383056,46055,23,24000,11000,4000
6,Tangled,2010,2010,145,1,324,387,7.8,260000000,200807262,294810,9720,15,799,553,284
7,Avengers: Age of Ultron,2015,2015,141,4,635,1117,7.5,250000000,458991599,462669,92000,10,26000,21000,19000
8,Avengers: Age of Ultron,2015,2015,141,4,635,1117,7.5,250000000,458991599,462669,92000,10,26000,21000,19000
9,Harry Potter and the Half-Blood Prince,2009,2009,153,3,375,973,7.5,250000000,301956980,321795,58753,282,25000,11000,10000
