In [69]:
#Data initialization from dataset and reading shape

import pandas as pd
import numpy as np

#importing movies csv file.

movies = pd.read_csv('tmdb_5000_movies.csv')
print("The shape of csv files ",movies.shape) #reading the shape of csv file
print("Movies columns:",movies.columns.tolist())

The shape of csv files  (4803, 20)
Movies columns: ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count']


In [70]:
#Data Exploration

print("Exploring first 3 row datas:")
print(movies.head(3)) #first 3 rows
print("="*60)

print("\nExploring last 3 row datas:")
print(movies.tail(3)) #first 3 rows
print("="*60)

print("\nDataset info")
print(movies.info())
print("="*60)

print("\nListing the number of missing datas ")
missing_data=movies.isnull().sum() #counts null per column
print(missing_data[missing_data>0]) #shows only column with missing data here None




Exploring first 3 row datas:
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "name": "spy"}, {"id": 818, "name...                en   

                             original_title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                          

In [71]:
#Data cleaning
clean=movies.copy()

#filling the missing values via median
clean['runtime'] = clean['runtime'].fillna(clean['runtime'].median())
clean['profit']=clean['revenue']-clean['budget'] #addding a new column

print(clean.shape)







(4803, 21)


In [72]:
# BAsic Analysis

print("Total Movies",len(clean))
print("="*60)
print(f"Average RAting {clean['vote_average'].mean():.2f}")
print("="*60)
print(f"Top 5 highest rated movie ")
top_rated=clean.nlargest(5,'vote_average')[['title','release_date','vote_average','genres']]
print(top_rated)

Total Movies 4803
Average RAting 6.09
Top 5 highest rated movie 
                      title release_date  vote_average  \
3519       Stiff Upper Lips   1998-06-12          10.0   
4045  Dancer, Texas Pop. 81   1998-05-01          10.0   
4247  Me You and Five Bucks   2015-07-07          10.0   
4662         Little Big Top   2006-01-01          10.0   
3992              Sardaarji   2015-06-26           9.5   

                                                 genres  
3519                     [{"id": 35, "name": "Comedy"}]  
4045  [{"id": 35, "name": "Comedy"}, {"id": 18, "nam...  
4247  [{"id": 10749, "name": "Romance"}, {"id": 35, ...  
4662                     [{"id": 35, "name": "Comedy"}]  
3992                                                 []  


In [73]:
# filtering and Grouping
high_rated =clean[clean['vote_average'] >= 8]

print(f"TOtal number of movies with rating greater than 8 is {len(high_rated)}")
print("="*60)
print(f"High RAted {high_rated}")


print("="*60)
genre =clean.groupby('genres').agg({
    'vote_average': 'mean',
    'revenue': 'mean',
    'title': 'count'}).round(2).sort_values('vote_average')

print(genre)


TOtal number of movies with rating greater than 8 is 85
High RAted          budget                                             genres  \
65    185000000  [{"id": 18, "name": "Drama"}, {"id": 28, "name...   
77    175000000  [{"id": 18, "name": "Drama"}, {"id": 35, "name...   
95    165000000  [{"id": 12, "name": "Adventure"}, {"id": 18, "...   
96    160000000  [{"id": 28, "name": "Action"}, {"id": 53, "nam...   
262    93000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
...         ...                                                ...   
4606     300000  [{"id": 10402, "name": "Music"}, {"id": 99, "n...   
4662          0                     [{"id": 35, "name": "Comedy"}]   
4678     200000  [{"id": 10402, "name": "Music"}, {"id": 18, "n...   
4679          0                [{"id": 99, "name": "Documentary"}]   
4755      50000                [{"id": 99, "name": "Documentary"}]   

                                          homepage      id  \
65    http://thedarkknight.war

In [75]:
lang = pd.pivot_table(
    clean,
    index='original_language',
    values='title',
    aggfunc='count'
)

print("Movies per Language:")
print(lang)



Movies per Language:
                   title
original_language       
af                     1
ar                     2
cn                    12
cs                     2
da                     7
de                    27
el                     1
en                  4505
es                    32
fa                     4
fr                    70
he                     3
hi                    19
hu                     1
id                     2
is                     1
it                    14
ja                    16
ko                    11
ky                     1
nb                     1
nl                     4
no                     1
pl                     1
ps                     1
pt                     9
ro                     2
ru                    11
sl                     1
sv                     5
ta                     2
te                     1
th                     3
tr                     1
vi                     1
xx                     1
zh                    27
