In [1]:
import pandas as pd
import  numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [2]:
df = pd.read_csv('bollywood_data_set.csv')
print(df.head())

   Unnamed: 0     imdb-id         movie_name year_of_release  runtime  \
0           0  tt10811166  The Kashmir Files          (2022)  170 min   
1           1   tt1187043           3 Idiots          (2009)  170 min   
2           2   tt0986264   Taare Zameen Par          (2007)  165 min   
3           3   tt5074352             Dangal          (2016)  161 min   
4           4   tt2338151                 PK          (2014)  153 min   

  IMDB_rating no_of_votes                                   plot_description  \
0         8.3     548,031  The Kashmir Files' is a story, based on video ...   
1         8.4     387,020  Two friends are searching for their long lost ...   
2         8.3     188,938  An eight-year-old boy is thought to be a lazy ...   
3         8.3     183,452  Former wrestler Mahavir Singh Phogat and his t...   
4         8.1     180,108  An alien on Earth loses the only device he can...   

          director                                             actors  
0  Vivek

In [3]:
df.columns

Index(['Unnamed: 0', 'imdb-id', 'movie_name', 'year_of_release', 'runtime',
       'IMDB_rating', 'no_of_votes', 'plot_description', 'director', 'actors'],
      dtype='object')

In [4]:
df.rename(columns={'Unnamed: 0':'Unique_Id'}, inplace=True)

In [5]:
df.info

<bound method DataFrame.info of       Unique_Id     imdb-id            movie_name year_of_release  runtime  \
0             0  tt10811166     The Kashmir Files          (2022)  170 min   
1             1   tt1187043              3 Idiots          (2009)  170 min   
2             2   tt0986264      Taare Zameen Par          (2007)  165 min   
3             3   tt5074352                Dangal          (2016)  161 min   
4             4   tt2338151                    PK          (2014)  153 min   
...         ...         ...                   ...             ...      ...   
9994       9994   tt3509212         Shaadi Kar Lo          (1978)            
9995       9995   tt3509194            Tatya Tope          (1978)            
9996       9996   tt3509180      Trishala Ka Laal          (1978)            
9997       9997   tt3509174  Karishma: Jamal Tree          (1982)            
9998       9998   tt3509156     Main Tumhari Hoon          (1978)            

     IMDB_rating no_of_votes  \

In [6]:
print(df.head())

   Unique_Id     imdb-id         movie_name year_of_release  runtime  \
0          0  tt10811166  The Kashmir Files          (2022)  170 min   
1          1   tt1187043           3 Idiots          (2009)  170 min   
2          2   tt0986264   Taare Zameen Par          (2007)  165 min   
3          3   tt5074352             Dangal          (2016)  161 min   
4          4   tt2338151                 PK          (2014)  153 min   

  IMDB_rating no_of_votes                                   plot_description  \
0         8.3     548,031  The Kashmir Files' is a story, based on video ...   
1         8.4     387,020  Two friends are searching for their long lost ...   
2         8.3     188,938  An eight-year-old boy is thought to be a lazy ...   
3         8.3     183,452  Former wrestler Mahavir Singh Phogat and his t...   
4         8.1     180,108  An alien on Earth loses the only device he can...   

          director                                             actors  
0  Vivek Agnih

In [7]:
# Split 'actors' column into two separate columns: 'director' and 'actors'
df[['director', 'actors']] = df['actors'].str.split('|', expand=True)

print(df[['director', 'actors']].head())

                      director  \
0              Vivek Agnihotri   
1              Rajkumar Hirani   
2      Aamir Khan, Amole Gupte   
3                Nitesh Tiwari   
4              Rajkumar Hirani   

                                              actors  
0       Mithun Chakraborty, Anupam Kher, Darshan ...  
1       Aamir Khan, Madhavan, Mona Singh, Sharman...  
2       Darsheel Safary, Aamir Khan, Tisca Chopra...  
3       Aamir Khan, Sakshi Tanwar, Fatima Sana Sh...  
4       Aamir Khan, Anushka Sharma, Sanjay Dutt, ...  


In [8]:
df['director'].fillna('Unknown', inplace=True)

In [9]:
# Count the number of non-null values in each column
row_counts = df.count()
print(row_counts)

Unique_Id           9999
imdb-id             9999
movie_name          9999
year_of_release     9799
runtime             9999
IMDB_rating         9999
no_of_votes         9999
plot_description    9999
director            9999
actors              8648
dtype: int64


In [10]:
# Remove parentheses from 'year_of_release' column
df['year_of_release'] = df['year_of_release'].str.strip('()')  # Remove parentheses

# Convert 'year_of_release' to numeric (integer)
df['year_of_release'] = pd.to_numeric(df['year_of_release'], errors='coerce')  # Coerce invalid parsing to NaN
df['year_of_release'] = df['year_of_release'].fillna(0).astype(int)  # Fill NaNs with 0 and convert to int


df['IMDB_rating'] = pd.to_numeric(df['IMDB_rating'], errors='coerce')  # Convert to float

df['no_of_votes'] = df['no_of_votes'].str.replace(',', '')      # Remove commas
df['no_of_votes'] = pd.to_numeric(df['no_of_votes'], errors='coerce').fillna(0).astype(int)

df['actors'].fillna('Unknown', inplace=True)  # Replace NaN with 'Unknown'


In [11]:
print(df.dtypes)

Unique_Id             int64
imdb-id              object
movie_name           object
year_of_release       int32
runtime              object
IMDB_rating         float64
no_of_votes           int32
plot_description     object
director             object
actors               object
dtype: object


In [12]:
# Count the rows that satisfy all conditions
count_to_remove = df[(df['IMDB_rating'].isnull()) & 
                     (df['no_of_votes'].isnull()) & 
                     (df['plot_description'] == 'Add a Plot') & 
                     (df['director'] == 'Unknown') & 
                     (df['actors'] == 'Unknown')].shape[0]

# Print the count of rows that would be removed
print(f"Rows to be removed: {count_to_remove}")


Rows to be removed: 0


In [13]:
# Remove rows that satisfy all conditions
df = df[~((df['IMDB_rating'].isnull()) & 
          (df['no_of_votes'].isnull()) & 
          (df['plot_description'] == 'Add a Plot') & 
          (df['director'] == 'Unknown') & 
          (df['actors'] == 'Unknown'))]

print(df.isnull().sum()) 

Unique_Id              0
imdb-id                0
movie_name             0
year_of_release        0
runtime                0
IMDB_rating         2342
no_of_votes            0
plot_description       0
director               0
actors                 0
dtype: int64


In [14]:
print(df['runtime'].unique())
print(df['runtime'].dtype)

['170 min' '165 min' '161 min' '153 min' '136 min' '135 min' '101 min'
 '167 min' '224 min' '133 min' '321 min' '139 min' '210 min' '163 min'
 '151 min' '156 min' '155 min' '145 min' '134 min' '183 min' '186 min'
 '146 min' '122 min' '123 min' '138 min' '125 min' '152 min' '159 min'
 '141 min' '130 min' '144 min' '104 min' '204 min' '160 min' '198 min'
 '184 min' '148 min' '192 min' '177 min' '143 min' '172 min' '142 min'
 '162 min' '140 min' '185 min' '103 min' '164 min' '154 min' '149 min'
 '132 min' '178 min' '180 min' '179 min' '112 min' '158 min' '147 min'
 '124 min' '128 min' '168 min' '126 min' '213 min' '173 min' '150 min'
 '216 min' '175 min' '109 min' '118 min' '120 min' '129 min' '114 min'
 '174 min' '137 min' '113 min' '171 min' '116 min' '176 min' '206 min'
 '193 min' '106 min' '188 min' '157 min' '166 min' '181 min' '187 min'
 '119 min' '115 min' '117 min' '191 min' '108 min' '100 min' '105 min'
 '95 min' '94 min' '107 min' '197 min' '99 min' '121 min' '131 min'
 '127 min

In [15]:
# Remove the ' min' string and commas from the 'runtime' column
df['runtime'] = df['runtime'].str.replace(' min', '')  # Remove ' min'
df['runtime'] = df['runtime'].str.replace(',', '')     # Remove commas

# Convert 'runtime' to numeric (integer) type, coercing any invalid entries to NaN
df['runtime'] = pd.to_numeric(df['runtime'], errors='coerce')

# Handle missing values by filling them with a default value, e.g., 0 or the mean runtime
df['runtime'].fillna(0, inplace=True)  # Replace NaN with 0 (you can choose a different default if needed)

# Convert to integer
df['runtime'] = df['runtime'].astype(int)

# Print the unique values to confirm
print(df['runtime'].unique())
print(df['runtime'].dtype)




[ 170  165  161  153  136  135  101  167  224  133  321  139  210  163
  151  156  155  145  134  183  186  146  122  123  138  125  152  159
  141  130  144  104  204  160  198  184  148  192  177  143  172  142
  162  140  185  103  164  154  149  132  178  180  179  112  158  147
  124  128  168  126  213  173  150  216  175  109  118  120  129  114
  174  137  113  171  116  176  206  193  106  188  157  166  181  187
  119  115  117  191  108  100  105   95   94  107  197   99  121  131
  127  110  169   83  102   96  111   98   80   88   90   77  190  255
   93  189   97  182  195   92  202   86   89   91    0  207  238  208
   87   82   68   84   78   85  199   65  214   72   57   60   50   48
   79  201  222  247   45   74  200   75   76   67   55  223   62   52
   47   66   49  235   70  240  217   81   69   64   73   61   63  211
   46   51  250   71   56   58   59  298   54   53 1179   34]
int32


In [16]:
from sklearn.linear_model import LinearRegression

# Split data into train and test sets
train_data = df[df['IMDB_rating'].notna()]
test_data = df[df['IMDB_rating'].isna()]

# Select features and target
features = ['runtime', 'year_of_release', 'no_of_votes']  # Add other features as necessary
target = 'IMDB_rating'

# Train a regression model
model = LinearRegression()
model.fit(train_data[features], train_data[target])

# Predict missing IMDB_rating values
predicted_ratings = model.predict(test_data[features])

# Fill missing values with the predicted ratings
df.loc[df['IMDB_rating'].isna(), 'IMDB_rating'] = predicted_ratings



In [17]:
print(df.isnull().sum())
print(df.dtypes)

Unique_Id           0
imdb-id             0
movie_name          0
year_of_release     0
runtime             0
IMDB_rating         0
no_of_votes         0
plot_description    0
director            0
actors              0
dtype: int64
Unique_Id             int64
imdb-id              object
movie_name           object
year_of_release       int32
runtime               int32
IMDB_rating         float64
no_of_votes           int32
plot_description     object
director             object
actors               object
dtype: object


In [18]:
import pandas as pd
df.to_csv('preprocessed_data.csv', index=False)

In [19]:
#movie_name
#year_of_release
#IMDB_rating
#no_of_votes 
#plot_description
#director
#actors
#imdb-id
df = df[['imdb-id','movie_name','plot_description','director','actors','IMDB_rating','no_of_votes','year_of_release']]

In [20]:
df.head(1)

Unnamed: 0,imdb-id,movie_name,plot_description,director,actors,IMDB_rating,no_of_votes,year_of_release
0,tt10811166,The Kashmir Files,"The Kashmir Files' is a story, based on video ...",Vivek Agnihotri,"Mithun Chakraborty, Anupam Kher, Darshan ...",8.3,548031,2022


In [21]:
df['plot_description'] = df['plot_description'].apply(lambda x:x.split())
df['actors'] = list(df['actors'].apply(lambda x:x.split(",")))
df['director'] = list(df['director'].apply(lambda x:x.split(",")))

In [22]:
df['actors'] = df['actors'].apply(lambda x:[i.replace(" ","") for i in x])
df['director'] = df['director'].apply(lambda x:[i.replace(" ","") for i in x])

In [23]:
df['tags'] = df['plot_description'] + df['actors'] + df['director']

In [24]:
new_df = df[['imdb-id','movie_name','tags','year_of_release','no_of_votes','IMDB_rating']]

In [25]:
new_df.head()

Unnamed: 0,imdb-id,movie_name,tags,year_of_release,no_of_votes,IMDB_rating
0,tt10811166,The Kashmir Files,"[The, Kashmir, Files', is, a, story,, based, o...",2022,548031,8.3
1,tt1187043,3 Idiots,"[Two, friends, are, searching, for, their, lon...",2009,387020,8.4
2,tt0986264,Taare Zameen Par,"[An, eight-year-old, boy, is, thought, to, be,...",2007,188938,8.3
3,tt5074352,Dangal,"[Former, wrestler, Mahavir, Singh, Phogat, and...",2016,183452,8.3
4,tt2338151,PK,"[An, alien, on, Earth, loses, the, only, devic...",2014,180108,8.1


In [26]:
import warnings
warnings.filterwarnings('ignore')
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

In [27]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

In [28]:
new_df['tags'][0]

"the kashmir files' is a story, based on video interviews of the first generation victims of the genocide of kashmiri pandit community in 1990. mithunchakraborty anupamkher darshankumaar pallavijoshi vivekagnihotri"

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000,stop_words='english')

In [30]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [31]:
!pip install nltk



In [32]:
import nltk

In [33]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [34]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [35]:
ps.stem('running')

'run'

In [36]:
new_df['tags'] = new_df['tags'].apply(stem)

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)
similarity[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [38]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:4]

[(7943, 0.3307189138830738), (1401, 0.3125), (4239, 0.3061862178478973)]

In [39]:
new_df[new_df['movie_name'] == 'PK'].index[0]

4

In [40]:
def recommend(movie):
    movie_index = new_df[new_df['movie_name'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:4]
    
    for i in movies_list:
        
        print(new_df.iloc[i[0]].movie_name)

In [41]:
new_df.iloc[1216].movie_name

'Takkari Donga'

In [42]:
recommend('Dangal')

Jango
Dharma
Shehzaade


In [43]:
import pickle
pickle.dump(new_df,open('movies.pkl','wb'))
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))