# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Import dataset

In [None]:
# Load the movie dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs on a machine where this exact file exists.
dataset_path = r"C:\Users\Akash Yadav\Desktop\dataset\movie_dataset.csv"
data = pd.read_csv(dataset_path)

# Explore the dataset..

In [None]:
# Preview the first 10 rows to see the columns and their raw values.
data.head(10)

In [None]:
# As we can see, the values in the 'release' column need to be cleaned.

In [None]:
data.shape # (rows, columns) of the raw data; the author observed 2000 rows and 10 columns

In [None]:
data.dtypes # data type of every column

In [None]:
data.duplicated().sum() # number of fully duplicated rows (a count, not the rows themselves)

In [None]:
data.isnull().sum() # number of null values in each column

In [None]:
data.info() # prints a concise summary of the DataFrame

In [None]:
sns.heatmap(data.isnull())
# The heatmap draws the boolean null mask, so it shows for every column which rows hold missing values.

# Data cleaning..

In [None]:
# First, drop every row in which all columns are empty.
data =data.dropna(how = 'all')
data.shape

# The author observed that no such fully empty rows were present.

In [None]:
# Give the 'runtime min' column the shorter name 'runtime'; the cleaning steps
# further down refer to the column by that name.
data = data.rename(columns={'runtime min': 'runtime'})
data.head()

In [None]:
# List the rows whose 'release' value is longer than four characters.
# These rows are removed in the next step.
release_too_long = data['release'].str.len().gt(4)
data.loc[release_too_long]

In [None]:
# Remove the rows whose 'release' value is longer than four characters.
too_long_mask = data['release'].str.len() > 4
rows_to_remove = data.loc[too_long_mask].index
data.drop(index=rows_to_remove, inplace=True)
data.shape
# The shape shows how many rows remain after the removal.

In [None]:
# Also remove the rows whose 'release' value is not exactly four characters long.
# Comparing with `!= 4` rather than `< 4` additionally removes rows where
# 'release' is missing: `.str.len()` yields NaN for them, and NaN compares False
# against `< 4`, so they would otherwise survive and make the later
# `astype(int)` conversion of the column fail.
invalid_release = data['release'].str.len().ne(4)
data.drop(data[invalid_release].index, inplace=True)
data.shape

In [None]:
# Show the rows that have no 'runtime' value; they are deleted in the next step.
runtime_missing = data['runtime'].isna()
data.loc[runtime_missing]

In [None]:
# Discard the rows without a 'runtime' value, as they are of no use for the analysis.
data.dropna(subset=['runtime'], inplace=True)
data.shape
# The shape shows how many rows remain after the removal.

# Movies Production Every Year..

In [None]:
data.dtypes # the author observed that 'release' is still of object dtype after the cleaning, so it has to be converted to int

In [None]:
# Turn the 'release' column into integers by going through its string form.
# `pd.to_numeric(data['release'])` is an alternative way to do the conversion.
release_as_text = data['release'].astype(str)
data['release'] = release_as_text.astype(int)
data.dtypes

In [None]:
# Only consider movies released after 1970.
condition = data['release'].astype(int).gt(1970)

# Number of movies per release year. `count()` tallies the non-null 'rating'
# entries of each year; 'rating' is only selected so that there is a column to
# carry the tally, its values are not used.
yearly_subset = data.loc[condition, ["release", "rating"]]
prodcount = yearly_subset.groupby("release").count()

prodcount.tail()

In [None]:
# Smooth the yearly counts with a 5-year rolling mean. The tally lives in the
# 'rating' column, so it is renamed to 'count' for the plot legend.
smoothed_counts = prodcount.rolling(5).mean()
smoothed_counts = smoothed_counts.rename(columns={"rating": "count"})
smoothed_counts.plot(figsize=(15, 5),
                     title="Number of Movies - by production year")
# A rolling window of 5 gives a smoother curve; a smaller window makes the line spikier.

# top 10 rated movies of all time..

In [None]:
# Ten rows with the highest 'rating', best first.
data.sort_values('rating',ascending=False).head(10)

# top 10 longest running time movies

In [None]:
# Ten rows with the largest 'runtime', longest first.
data.sort_values('runtime',ascending=False).head(10)

# top 10 highest grosser of all time..

In [None]:
# Ten rows with the largest 'gross_in_USD', highest first.
# NOTE(review): assumes 'gross_in_USD' is a numeric column -- confirm against data.dtypes.
data.sort_values('gross_in_USD',ascending=False).head(10)

# Top Movies Genres

In [None]:
# Let's look at a pie chart of the genres.

In [None]:
# Count how often each genre occurs. Every 'genre' value is split on commas and
# each resulting name is counted once per row it appears in.
nn = data['genre']
dic = {}
for genre_field in nn.dropna():  # a missing genre is a float NaN and has no .split()
    for raw_name in genre_field.split(','):
        v = raw_name.replace(" ", "")  # remove the spaces around the names
        if not v:
            continue  # skip empty fragments, e.g. the one produced by a trailing comma
        dic[v] = dic.get(v, 0) + 1

In [None]:
# Draw the share of every genre as a pie chart.
nam = list(dic)           # genre names, in the order they were inserted into `dic`
val = list(dic.values())  # count per genre, aligned with `nam`

# Pull the second slice (index 1) out of the pie by 0.2. The offsets are derived
# from the number of genres because plt.pie raises a ValueError when `explode`
# and the values differ in length; a fixed 14-element list only works when there
# are exactly 14 genres.
exp = [0.2 if position == 1 else 0.0 for position in range(len(val))]

plt.title('Most popular genre in Bollywood')
plt.pie(val, labels=nam, explode=exp, autopct="%.2f%%")
# Equal aspect ratio so the pie is drawn as a circle. `radius` and `frame` are
# not options of plt.axis (current matplotlib raises a TypeError for them), so
# they are no longer passed here.
plt.axis('equal')
plt.show()

In [None]:
# So the most popular genre in India is drama, followed by romance and then comedy.

# Analysis of Best actors 

In [None]:
# Convert the 'votes' column from object (text) to float.
# The values are cast to str before the ',' characters are stripped, so a
# missing value (a float NaN) does not break the conversion and the cell can be
# re-run after the column has already become numeric. Anything that still
# cannot be parsed as a number becomes NaN.
votes_text = data['votes'].astype(str).str.replace(',', '', regex=False)
data['votes'] = pd.to_numeric(votes_text, errors='coerce').astype(float)

print(data.dtypes)  # 'votes' should now be listed as a float column


In [None]:
# Per-actor statistics, computed in one grouped pass instead of filtering the
# whole DataFrame twice for every actor.
# `sort=False` keeps the actors in order of first appearance, the same order
# `data['actor'].unique()` returns. Rows with a missing actor are left out
# (groupby drops NaN keys by default), so no empty NaN entry is produced.
actor_stats = data.groupby('actor', sort=False).agg({
    'movie_name': 'count',   # number of movies listed for the actor
    'rating': 'mean',        # average rating of those movies
    'gross_in_USD': 'sum',   # total box office collection
    'votes': 'sum',          # total number of audience votes
})

ac = actor_stats.index.to_numpy()               # all the actors
no_mov = actor_stats['movie_name'].tolist()     # movie count per actor
aver_rat = actor_stats['rating'].tolist()       # average rating per actor
box_off = actor_stats['gross_in_USD'].tolist()  # box office collection per actor
pub_vot = actor_stats['votes'].tolist()         # votes received per actor


Converting these lists into a DataFrame so that we can access and manipulate the per-actor data easily.

In [None]:
# Assemble the per-actor lists into one DataFrame, one row per actor.
actor_columns = {
    'actor_name': ac,
    'no_of_movies': no_mov,
    'collection': box_off,
    'avg_rating': aver_rat,
    'votes': pub_vot,
}
actors = pd.DataFrame(actor_columns)
actors.head()

# Best rated stars of Bollywood (with at least 15 movies)

In [None]:
# Shortlist the actors with at least 15 movies and an average rating above 6,
# then keep the ten with the highest average rating.
has_enough_movies = actors['no_of_movies'] >= 15
is_well_rated = actors['avg_rating'] > 6
akash = (
    actors[has_enough_movies & is_well_rated]
    .sort_values('avg_rating', ascending=False)
    .head(10)
)

In [None]:
plt.title("Best actors according to ratings\n")
# Numeric values on x and names on y give a horizontal bar plot. x and y are
# passed by keyword because seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=akash['avg_rating'], y=akash['actor_name'])
plt.show()

# Biggest Box-Office Attractions

In [None]:
# Top 10 actors by total box office collection, highest first.
abhi=actors.sort_values('collection',ascending=False).head(10)

In [None]:
plt.title("Best actors according to Box office\n")
# Numeric values on x and names on y give a horizontal bar plot. x and y are
# passed by keyword because seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=abhi['collection'], y=abhi['actor_name'])
plt.show()

# People's most loved superstars

In [None]:
# Top 10 actors by total number of votes, highest first.
ans=actors.sort_values('votes',ascending=False).head(10)

In [None]:
plt.title("Most loved superstars\n")
# Plot `ans`, the ten actors with the most votes. The chart previously used
# `abhi`, which holds the top ten by box office collection, so it showed the
# wrong actors. x and y are passed by keyword because seaborn 0.12+ no longer
# accepts them positionally.
sns.barplot(x=ans['votes'], y=ans['actor_name'])
plt.show()

#  Analysis of the best Director

In [None]:
# Per-director statistics, computed in one grouped pass instead of filtering
# the whole DataFrame twice for every director.
# `sort=False` keeps the directors in order of first appearance, the same order
# `data['director'].unique()` returns. Rows with a missing director are left
# out (groupby drops NaN keys by default), so no empty NaN entry is produced.
director_stats = data.groupby('director', sort=False).agg({
    'movie_name': 'count',   # number of movies listed for the director
    'rating': 'mean',        # average rating of those movies
    'gross_in_USD': 'sum',   # total box office collection
    'votes': 'sum',          # total number of audience votes
})

dic = director_stats.index.to_numpy()                 # all the directors
d_mov = director_stats['movie_name'].tolist()         # movie count per director
d_avg = director_stats['rating'].tolist()             # average rating per director
d_collect = director_stats['gross_in_USD'].tolist()   # box office collection per director
d_vot = director_stats['votes'].tolist()              # votes received per director


Converting these lists into a DataFrame so that we can access and manipulate the per-director data easily.

In [None]:
# Assemble the per-director lists into one DataFrame, one row per director.
director_columns = {
    'direct_name': dic,
    'no_of_movies': d_mov,
    'collection': d_collect,
    'avg_rating': d_avg,
    'votes': d_vot,
}
direct = pd.DataFrame(director_columns)
direct.head()

# Director which means quality

In [None]:
# Keep the directors with at least 5 movies and an average rating above 5.
has_enough_movies = direct['no_of_movies'] >= 5
is_well_rated = direct['avg_rating'] > 5
ds = direct[has_enough_movies & is_well_rated]

In [None]:
# Best rated directors among the shortlisted ones, highest average rating first.
# NOTE(review): this keeps 11 rows while the other rankings keep 10 -- confirm this is intended.
abc=ds.sort_values('avg_rating',ascending=False).head(11)

In [None]:
plt.title("Best directors accoring to the ratings\n")
# Numeric values on x and names on y give a horizontal bar plot. x and y are
# passed by keyword because seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=abc['avg_rating'], y=abc['direct_name'])
plt.show()

# Director which means box office

In [None]:
# Top 10 directors by total box office collection, highest first.
cll=direct.sort_values('collection',ascending=False).head(10)

In [None]:
plt.title("Most profitable directors\n")
# Numeric values on x and names on y give a horizontal bar plot. x and y are
# passed by keyword because seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=cll['collection'], y=cll['direct_name'])
plt.show()

# Director which means MASS

In [None]:
# Top 10 directors by total number of votes, highest first.
mass=direct.sort_values('votes',ascending=False).head(10)

In [None]:
plt.title("Most loves directors\n")
# Numeric values on x and names on y give a horizontal bar plot. x and y are
# passed by keyword because seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=mass['votes'], y=mass['direct_name'])
plt.show()

# Conclusion : the success mantra..

# Director:
Rajkumar Hirani is clearly the best option by far in the best-director category. He is a one-man army; this guy means serious business.

# Actor:
For actors, it is a tie between the two Khans of Bollywood, Aamir Khan and Shah Rukh Khan: both are good actors as per IMDb and are also the biggest box-office attractions.
But if you are looking for actors with great acting skills, you should consider Irfan Khan and Manoj Bajpayee.

# Actress:
Deepika Padukone is the only female superstar according to public voting and box-office collection.

# Movie Genre:
Drama, romance, comedy and action are the best genres in Bollywood, in descending order.
