In [18]:
import pandas as pd
import numpy as np
import ast

## Loading Dataset

In [19]:
movies = pd.read_csv('dataset/tmdb_5000_movies.csv')
credits = pd.read_csv('dataset/tmdb_5000_credits.csv')

In [20]:
# First row from the movies dataframe
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [21]:
# First row
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [22]:
# Merging the datasets based on the title column (why not id? - probably because column names are different)
movies = movies.merge(credits, on='title')
movies.shape

(4809, 23)

In [23]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [24]:
# Language distribution of movies
print(movies['original_language'].value_counts())
print()

# Unique languages present
number_of_languages = movies['original_language'].nunique()
print(number_of_languages)

original_language
en    4510
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
ko      12
cn      12
ru      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
id       2
cs       2
ta       2
ro       2
ar       2
te       1
hu       1
xx       1
af       1
is       1
tr       1
vi       1
pl       1
nb       1
ky       1
no       1
sl       1
ps       1
el       1
Name: count, dtype: int64

37


## Data Cleaning and Preprocessing
We filter out unnecessary columns which won't help us in creating tags necessary for the content-based recommender system. Our goal is to keep the column data which we can use later for creating content-based tags to describe the movies for our model.

Columns kept:
- genres (useful for content similiarity)
- id (getting details, posters etc. for website creation)
- keywords (list of keywords describing the movie)
- title (keeping this instead of `original_title` as ~95% movies are in English, so there will be a consistent English title for each movie)
- overview (clearly shows content description)
- cast (movies with similar actors)
- crew (movies with the same directors)


In [25]:
movies_cleaned = movies[['id', 'genres', 'keywords', 'title', 'overview', 'cast', 'crew']]
movies_cleaned.head()

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",Spectre,A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",John Carter,"John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [26]:
# Counting number of NULL VALUES
movies_cleaned.isnull().sum()
# shows overview field has 3 null values

id          0
genres      0
keywords    0
title       0
overview    3
cast        0
crew        0
dtype: int64

In [27]:
# movies_cleaned = movies_cleaned.fillna('')

movies_cleaned.dropna(inplace=True) # Modifies the DF directly by dropping rows with null values in any of the columns
# movies_cleaned.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned.dropna(inplace=True) # Modifies the DF directly by dropping rows with null values in any of the columns


In [28]:
# Counting number of DUPLICATED VALUES
movies_cleaned.duplicated().sum()

np.int64(0)

In [29]:
# helper function to get the names from lists object
def extract_genres_keywords(obj):
    list_of_names = []
    for i in ast.literal_eval(obj):
        list_of_names.append(i['name'])
    return list_of_names

def extract_cast(obj):
    list_of_names = []
    for i in ast.literal_eval(obj):
        list_of_names.append(i['name'])
        if len(list_of_names) >= 5:
            break
    return list_of_names

In [30]:
# Cleaning the genres column
# movies_cleaned.iloc[0] --- returns the first row in the dataframe

movies_cleaned.iloc[0].genres # list of dictionaries from which we extract the values of the names into a list for data preprocessing
movies_cleaned['genres'] = movies_cleaned['genres'].apply(extract_genres_keywords)

# Cleaning the keywords column
movies_cleaned.iloc[0].keywords
movies_cleaned['keywords'] = movies_cleaned['keywords'].apply(extract_genres_keywords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['genres'] = movies_cleaned['genres'].apply(extract_genres_keywords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['keywords'] = movies_cleaned['keywords'].apply(extract_genres_keywords)


In [31]:
# Extracting the names of the top 5 actors from the cast column
movies_cleaned['cast'] = movies_cleaned['cast'].apply(extract_cast)
movies_cleaned.iloc[0].cast

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['cast'] = movies_cleaned['cast'].apply(extract_cast)


['Sam Worthington',
 'Zoe Saldana',
 'Sigourney Weaver',
 'Stephen Lang',
 'Michelle Rodriguez']

In [32]:
def get_director_name(obj):
    director = []
    obj = ast.literal_eval(obj)
    for i in obj:
        if i['job'] == 'Director':
            director.append(i['name'])
            return director

movies_cleaned['crew'] = movies_cleaned['crew'].apply(get_director_name)
movies_cleaned.iloc[0].crew


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['crew'] = movies_cleaned['crew'].apply(get_director_name)


['James Cameron']

In [33]:
movies_cleaned.head()

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


### Data Preprocessing
We create a new dataframe using `movies_cleaned` with three columns : `id`, `title`, `tags`. The tags column is created by combining information from `overview`, selecting the names from `genres` and `keywords`, picking the names of the top 5 actors in `cast` and the director name from the `crew`. The whole text paragraph with these details is stored in the `tags` coulmn.

In [34]:
# Removing the spaces between words in the columns to avoid creation of separate entities for a keyword, cast names, gneres etc. when creating the tags

movies_cleaned['genres'] = movies_cleaned['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies_cleaned['keywords'] = movies_cleaned['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies_cleaned['cast'] = movies_cleaned['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies_cleaned['crew'] = movies_cleaned['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['genres'] = movies_cleaned['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['keywords'] = movies_cleaned['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m

TypeError: 'NoneType' object is not iterable

In [35]:
# Checking for null values
movies_cleaned.isnull().sum()

id           0
genres       0
keywords     0
title        0
overview     0
cast         0
crew        30
dtype: int64

In [36]:
# Removing null values from crew
movies_cleaned['crew'] = movies_cleaned['crew'].fillna('[Unknown]')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['crew'] = movies_cleaned['crew'].fillna('[Unknown]')


In [37]:
movies_cleaned['crew'] = movies_cleaned['crew'].apply(lambda x:[i.replace(" ", "") for i in x])
movies_cleaned['overview'] = movies_cleaned['overview'].apply(lambda x: x.split() if isinstance(x, str) else [])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['crew'] = movies_cleaned['crew'].apply(lambda x:[i.replace(" ", "") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['overview'] = movies_cleaned['overview'].apply(lambda x: x.split() if isinstance(x, str) else [])


In [38]:
movies_cleaned.head(1)

Unnamed: 0,id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]


In [39]:
# Creating the tags column by combining the overview, genres, keywords, cast and crew columns

movies_cleaned['tags'] = movies_cleaned['overview'] + movies_cleaned['genres'] + movies_cleaned['keywords'] + movies_cleaned['cast'] + movies_cleaned['crew']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_cleaned['tags'] = movies_cleaned['overview'] + movies_cleaned['genres'] + movies_cleaned['keywords'] + movies_cleaned['cast'] + movies_cleaned['crew']


In [40]:
movies_cleaned.iloc[0].tags

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.',
 'Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'StephenLang',
 'MichelleRodriguez',
 'JamesCameron']

In [41]:
# Creating a new movies dataframe using the cleaned columns for further processing
movies_df = movies_cleaned[['id', 'title', 'tags']]
movies_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [42]:
movies_df['tags'] = movies_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['tags'] = movies_df['tags'].apply(lambda x:" ".join(x))


In [43]:
movies_df['tags'] = movies_df['tags'].apply(lambda x:x.lower())
movies_df.iloc[2].tags

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['tags'] = movies_df['tags'].apply(lambda x:x.lower())


'a cryptic message from bond’s past sends him on a trail to uncover a sinister organization. while m battles political forces to keep the secret service alive, bond peels back the layers of deceit to reveal the terrible truth behind spectre. action adventure crime spy basedonnovel secretagent sequel mi6 britishsecretservice unitedkingdom danielcraig christophwaltz léaseydoux ralphfiennes monicabellucci sammendes'

## Vectorization
We convert the tags into vectors, using `Bag-of-Words` model, in order to calculate the similarity between the movies. We combine all the 5000 tags and make a big text of these words. From this big text, we extract the 5000 most frequently occuring words. We then go over every tag and compare every word in the 5000 most frequently occuring set and count how many times they occuring in the tag.

|            | word1 | word2 | word3 | ... | word5000 |
|------------|-------|-------|-------|-----|----------|
| movie tag 1 | 66    | 75    | 8     |     | 3        |
| movie tag 2 | 67    | 33    | 64    |     | 56       |
| movie tag 3 | 78    | 6     | 78    |     | 18       |
| ...        |       |       |       |     |          |
| movie tag 5000 | 77    | 444   | 36    |     | 68       |

Every row in this (5000, 5000) matrix is a vector corresponding to a film.
We use the `CountVectorizer` in `sklearn.feature_extraction.text` to convert the collection of text documents into a matrix of token (word) counts.

### 🔍 What It Does
Given a list of text documents, `CountVectorizer`:
- Tokenizes the text (splits it into words)
- Builds a vocabulary of known words
- Counts how many times each word appears in each document

It returns a sparse matrix where:
- Rows represent documents
- Columns represent vocabulary words
- Values are the word counts

### ⚙️ Common Parameters
- `stop_words='english'`: Removes common English words like “the”, “is”, “and”
- `max_features=5000`: Limits to the top 5000 words
- `ngram_range=(1,2)`: Includes unigrams, bigrams, etc.
- `min_df=2`: Ignores words in fewer than 2 documents

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

vectors = cv.fit_transform(movies_df['tags']).toarray()

In [45]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      shape=(5000,), dtype=object)

In [46]:
# We apply STEMMING to ensure that words like 'actions', 'action' etc. get grouped into one category as they have the same base word.
# Similarly, words like 'loved', 'loving', 'loves' get grouped under 'love'.

import nltk
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_text(text):
    stemmed_words = []
    for word in text.split():
        stemmed_words.append(ps.stem(word))
    
    return " ".join(stemmed_words)

movies_df['tags'] = movies_df['tags'].apply(stem_text)
movies_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['tags'] = movies_df['tags'].apply(stem_text)


Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [47]:
vectors = cv.fit_transform(movies_df['tags']).toarray()

In [48]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      shape=(5000,), dtype=object)

### Calculating similarity

To find movies which are similar to each other, we need to compute the distance between their corresponding vectors. In higher dimensions, Eucledian distance computation fails so we use the **cosine-similarity** which measures the angle between the two vectors. Smaller the angle, more similiar are the vectors.

Similiarity is the inverse of distance. Range(similarity) = [0, 1].

In [49]:
from sklearn.metrics.pairwise import cosine_similarity

# This computes the distance of each vector from every other vector
similarity_matrix = cosine_similarity(vectors)

similarity_matrix.shape

(4806, 4806)

In [50]:
# Similarity of the first movie with every other movie
similarity_matrix[0]

array([1.        , 0.07897472, 0.0836242 , ..., 0.04484485, 0.        ,
       0.        ], shape=(4806,))

In [63]:
# Returns 5 similar movies for the provided movie
def recommend(movie):
    # Fetching the index of the movie
    movie_index = movies_df[movies_df['title'] == movie].index[0]
    
    # Getting the vector distances for this movie from the similarity matrix
    distances = similarity_matrix[movie_index]

    # Sorting the distances to find the top 5 similar movies
    sorted_distances = sorted(list(enumerate(similarity_matrix[movie_index])), reverse=True)  
    # Here, list(enumerate(similarity_matrix[0])) returns a list of tuples where every tupple consists of the movie_number (row number)
    # and its similairty with the provided movie
    # We do the enumerate because we need to keep track of the movie index to get the required ID from the movies dataframe. Sorting on its own messes up the movie index
    # sorted_distances = sorted(list(enumerate(similarity_matrix[0])), reverse=True)   --- this sorts on the basis of the index, we need to sort on the basis of the distance

    sorted_distances = sorted(list(enumerate(similarity_matrix[movie_index])), reverse=True, key=lambda x:x[1])
    print(sorted_distances)

    top_5_movies = sorted_distances[1:6]
    for i in top_5_movies:
        print(movies_df.iloc[i[0]].title)
        print(i[0])
    # return top_5_movies

In [64]:
recommend('Batman Begins')

[(119, np.float64(1.0)), (65, np.float64(0.40516337191403146)), (1361, np.float64(0.34817875908197216)), (1360, np.float64(0.32826608214930636)), (3, np.float64(0.32297112073308687)), (4146, np.float64(0.2969278434243359)), (3295, np.float64(0.29274527692217917)), (9, np.float64(0.2904011597483873)), (3823, np.float64(0.28365431446558775)), (2248, np.float64(0.27690930984749823)), (3730, np.float64(0.2722074211540951)), (1248, np.float64(0.26802813370944867)), (3259, np.float64(0.26633805697108515)), (4103, np.float64(0.2646888890589147)), (4641, np.float64(0.2626128657194451)), (1452, np.float64(0.26100665575458365)), (2958, np.float64(0.25992434305464396)), (210, np.float64(0.25733746430109117)), (1411, np.float64(0.2553310899934462)), (4139, np.float64(0.2553310899934462)), (428, np.float64(0.25438386528639634)), (79, np.float64(0.2481458334927325)), (3236, np.float64(0.24770754201107464)), (3308, np.float64(0.24140393963016737)), (1254, np.float64(0.24000391846330957)), (813, np.fl