In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting and
# data libraries — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Marker in the cell output showing that imports and configuration finished.
print('Setup Completed')

In [None]:
# Read the IMDB Top-1000 CSV from the Kaggle input directory into the
# working frame used by every later cell.
IMDB_CSV_PATH = '/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv'
df = pd.read_csv(IMDB_CSV_PATH)

In [None]:
# Build an automated profiling report for the frame; leaving the report
# object as the cell's final expression renders it inline.
# NOTE(review): upstream has renamed pandas_profiling to ydata-profiling —
# confirm which package is installed in this environment.
import pandas_profiling as pf
profile_report = pf.ProfileReport(df)
profile_report

In [None]:
# Preview the first three rows of the loaded frame.
df.head(3)

In [None]:
# (number of rows, number of columns) before any cleaning is applied.
df.shape

In [None]:
# Column labels available in the dataset.
df.columns

In [None]:
# Print pandas' per-column summary (dtype and non-null count) for the frame.
df.info()

In [None]:
# Descriptive statistics rounded to two decimals, transposed so that each
# described column becomes one row.
numeric_summary = df.describe().round(2)
numeric_summary.T

In [None]:
# Percentage of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column * 100 / df.shape[0]

In [None]:
# Visualise where the missing values sit: the heatmap is drawn from the
# boolean null-mask of the frame (one cell per row/column entry).
sns.heatmap(df.isna())

In [None]:
# Discard every row that contains at least one missing value.
# The frame is modified in place, so all later cells see the reduced data.
df.dropna(axis='index', how='any', inplace=True)

In [None]:
# Check for fully duplicated rows. `duplicated` must be *called* to obtain the
# boolean mask; indexing with the bound method itself only works through
# pandas' callable-indexer fallback. An empty result means there are no
# duplicate rows.
df[df.duplicated()]

In [None]:
df.shape # rows with missing values were dropped, so the row count is lower than before

### Now Let's Analyze the Data

In [None]:
# Peek at two random rows. The fixed seed keeps the displayed rows identical
# across "Restart & Run All" executions.
df.sample(2, random_state=42)

In [None]:
# Titles of movies whose runtime is at least 180 minutes.
# 'Runtime' holds text such as '180 min' at this point, so comparing it with a
# string is lexicographic (e.g. '96 min' >= '180 min' is True). Parse the
# minutes into a temporary numeric Series instead; df['Runtime'] itself is
# left untouched for the later cleaning cells.
runtime_minutes = pd.to_numeric(
    df['Runtime'].astype(str).str.replace('min', '', regex=False).str.strip(),
    errors='coerce',  # unparseable entries become NaN and never match the filter
)
df.loc[runtime_minutes >= 180, 'Series_Title']

In [None]:
# Average number of votes per release year, highest first — the top entry
# answers "which year had the highest average vote count?".
votes_by_year = df.groupby('Released_Year')['No_of_Votes'].mean()
votes_by_year.sort_values(ascending=False)

In [None]:
# Largest value among the per-year average vote counts.
yearly_mean_votes = df.groupby('Released_Year')['No_of_Votes'].mean()
yearly_mean_votes.max()

In [None]:
# Bar chart of the 15 release years with the highest average vote count.
plt.figure(figsize=(12, 6), dpi=100)
top_15_years = (
    df.groupby('Released_Year')['No_of_Votes']
    .mean()
    .sort_values(ascending=False)
    .head(15)
)
top_15_years.plot.bar(color='green')  # draws on the figure created above
plt.xlabel('Year', fontsize=20)
plt.ylabel('Votes', fontsize=20)
plt.title('Top 15 average votes by year', fontsize=20)
plt.show()

In [None]:
# Yearly averages of gross revenue, vote count and Meta score.
# NOTE(review): 'Gross' may be stored as text with thousands separators
# (e.g. '28,341,469') — confirm against the CSV. Coerce it to a number first
# so the mean is actually computed; this is harmless when the column is
# already numeric. Missing or unparseable values are filled with 0, as before.
df['Gross'] = pd.to_numeric(
    df['Gross'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
).fillna(0)

yearly_means = df.pivot_table(
    values=['Gross', 'No_of_Votes', 'Meta_score'],
    index='Released_Year',
    aggfunc='mean',
)
# Keep the rows in release-year order so the x-axis of the line plot is
# chronological (sorting by score would scramble it), and give each metric its
# own panel because the three are presumably on very different scales.
yearly_means.sort_index().plot(subplots=True, figsize=(12, 8));

In [None]:
# Mean IMDB rating per director, best-rated first.
director_mean_rating = df.groupby('Director')['IMDB_Rating'].mean()
director_mean_rating.sort_values(ascending=False)

In [None]:
# Bar chart of the ten directors with the highest average IMDB rating.
plt.figure(figsize=(12,6),dpi=100)
top_directors = (
    df.groupby('Director')['IMDB_Rating']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
top_directors.plot(kind='bar')
plt.xlabel('Director',size=15)
plt.ylabel('Rating',size=15)
# The previous title was truncated ("... for each directo").
plt.title('Top 10 directors by average rating',size=15)
plt.show()

In [None]:
# Remove the 'min' unit so the column can be converted to a number afterwards.
# astype(str) keeps this cell re-runnable once 'Runtime' has already been made
# numeric (the .str accessor would otherwise raise), regex=False treats 'min'
# as literal text, and strip() drops the space left behind by the removal.
df['Runtime'] = (
    df['Runtime'].astype(str).str.replace('min', '', regex=False).str.strip()
)

In [None]:
# Convert the cleaned runtime strings to a numeric dtype.
runtime_numeric = pd.to_numeric(df['Runtime'])
df['Runtime'] = runtime_numeric

In [None]:
# The ten longest movies: title and runtime, ordered by runtime descending.
title_and_runtime = df[['Series_Title', 'Runtime']]
top_10_movies = title_and_runtime.sort_values(by='Runtime', ascending=False).head(10)

In [None]:
# Bar chart of the ten longest movies by runtime.
sns.barplot(data=top_10_movies, x='Series_Title', y='Runtime')
plt.title('Top 10 lengthiest movies')  # title was misspelled ('lenghy')
# Rotate the title labels on the x-axis; the trailing ';' keeps the returned
# tick objects from being echoed as the cell output.
plt.xticks(rotation=90);

In [None]:
# Grid of pairwise relationships between the frame's variables.
sns.pairplot(data=df)

In [None]:
# Look at the first two rows again after the cleaning and conversion steps.
df.head(2)

In [None]:
# Distinct values present in the release-year column.
df['Released_Year'].unique()

In [None]:
# Gross revenue by release year, for releases from 2000 onwards.
# Comparing 'Released_Year' with the string '2000' is lexicographic, so any
# non-numeric text in the column would also pass the filter. Work on numeric
# copies of the year and gross columns instead; df itself is not modified.
plt.figure(figsize=(12,6),dpi=100)
release_year = pd.to_numeric(df['Released_Year'], errors='coerce')
# NOTE(review): 'Gross' may be text with thousands separators — confirm; the
# coercion below is harmless when the column is already numeric.
gross_amount = pd.to_numeric(
    df['Gross'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
years_over_2000 = df.assign(Released_Year=release_year, Gross=gross_amount)
# Rows whose year could not be parsed are NaN and drop out of this filter.
years_over_2000 = years_over_2000[years_over_2000['Released_Year'] >= 2000]
years_over_2000 = years_over_2000.astype({'Released_Year': int})
sns.lineplot(x='Released_Year',y='Gross',data =years_over_2000 )
plt.xlabel('Release year')
plt.ylabel('Gross')
plt.title('Gross by release year (2000 onwards)')
plt.xticks(rotation=90);