# Analysis and cleaning of The Movie Database
source: https://www.kaggle.com/rounakbanik/the-movies-dataset?select=movies_metadata.csv

In [3]:
import numpy as np
import pandas as pd
import datetime
import re
import csv
import json
import os

In [4]:
# Read in "The Movies dataset"
# if this is read in as csv, each row is a string and dictionaries are not recognized.
df=pd.read_csv("Resources/movies_metadata.csv", low_memory=False)
print(df.describe())
print(df.columns)
print(df.dtypes)
print(df.isnull().sum())

            revenue       runtime  vote_average    vote_count
count  4.546000e+04  45203.000000  45460.000000  45460.000000
mean   1.120935e+07     94.128199      5.618207    109.897338
std    6.433225e+07     38.407810      1.924216    491.310374
min    0.000000e+00      0.000000      0.000000      0.000000
25%    0.000000e+00     85.000000      5.000000      3.000000
50%    0.000000e+00     95.000000      6.000000     10.000000
75%    0.000000e+00    107.000000      6.800000     34.000000
max    2.787965e+09   1256.000000     10.000000  14075.000000
Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')
adult                     object
belong

In [5]:
# Variable Disposition:
# Drop: adult, overview, poster_path, tagline, video, original title, title
#     candidates for NLP: overview, tagline (both objects)  
#     description variables:can be dropped
#       original_title
#       title
#
# Transform: release_date -> release year, voter_average -> success
# Transform to binary: belongs_to_collection -> collection, homepage-> website,
# Transform to categories: original_language -> orig_lang_cd
# Transform to numeric: budget
# handling null values:
#   release_year -> drop
#   runtime -> 0
#   status -> drop
# handling zero values
#   runtime: 1535 = 0, + 246 nulls converted to 0
#
# Keep: 
#   float: release_year, runtime, revenue, vote_count, budget
#   categorical: success,collection, homepage, original language
#   identifiers: id, imdb_id (both objects)
#
# filter: release_year>1969 and status = Released

# Issues:
#   budget: 76% zero
#   revenue: 81%
#   popularity: is not described

# Problems: transforming strings that look like json, but have single rather than double quotes
# production_countries      object
# production_companies      object
# genres                    object
# spoken_languages          object

In [6]:
df.head(10)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
5,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,1995-12-15,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0
6,False,,58000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,11860,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,...,1995-12-15,0.0,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0
7,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,45325,tt0112302,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",...,1995-12-22,0.0,97.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Original Bad Boys.,Tom and Huck,False,5.4,45.0
8,False,,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,1995-12-22,64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0
9,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,1995-11-16,352194034.0,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0


In [7]:
# Drop movies released prior to 1970. This helps reduce some of the missing value problems.
# clean up and change release_date to datetime
# save at each change to make reverting easy
r_date=df["release_date"]
r_date.str.strip()  #remove spaces
form1=r'(^[0-9]{4})'
rel_year = r_date.str.extract(f'({form1})')  #extract just year
rel_year["release_year"]=pd.to_numeric(rel_year[0])
print(rel_year.dtypes)
rel_year.drop([0,1],axis=1,inplace=True)
print(f'null rows: {rel_year.isnull().sum()}')
rel_year.dropna(how="any", inplace=True)
print(f'number of movies: {len(rel_year)}')
print(rel_year.dtypes)

0                object
1                object
release_year    float64
dtype: object
null rows: release_year    90
dtype: int64
number of movies: 45376
release_year    float64
dtype: object


In [8]:
rel_year=rel_year[rel_year["release_year"] > 1969]
year_counts=rel_year["release_year"].value_counts()
print('year counts:')
print(year_counts)

year counts:
2014.0    1974
2015.0    1905
2013.0    1889
2012.0    1722
2011.0    1667
2016.0    1604
2009.0    1586
2010.0    1501
2008.0    1473
2007.0    1320
2006.0    1270
2005.0    1125
2004.0     992
2002.0     905
2003.0     882
2001.0     865
2000.0     789
1999.0     723
1998.0     722
1997.0     661
1996.0     633
1995.0     599
1994.0     544
2017.0     532
1993.0     489
1988.0     467
1987.0     462
1992.0     453
1989.0     439
1990.0     427
1991.0     426
1986.0     391
1972.0     381
1971.0     378
1982.0     368
1985.0     368
1984.0     362
1980.0     361
1981.0     360
1973.0     356
1983.0     353
1970.0     351
1974.0     348
1979.0     338
1977.0     334
1976.0     333
1975.0     332
1978.0     321
2018.0       5
2020.0       1
Name: release_year, dtype: int64


In [9]:
len(rel_year)

37087

In [10]:
movies_year_df = pd.concat([df, rel_year], axis=1, ignore_index=False, join="inner")
len(movies_year_df)

37087

In [11]:
print(movies_year_df.columns)
movies_year_df["release_year"].value_counts()

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'release_year'],
      dtype='object')


2014.0    1974
2015.0    1905
2013.0    1889
2012.0    1722
2011.0    1667
2016.0    1604
2009.0    1586
2010.0    1501
2008.0    1473
2007.0    1320
2006.0    1270
2005.0    1125
2004.0     992
2002.0     905
2003.0     882
2001.0     865
2000.0     789
1999.0     723
1998.0     722
1997.0     661
1996.0     633
1995.0     599
1994.0     544
2017.0     532
1993.0     489
1988.0     467
1987.0     462
1992.0     453
1989.0     439
1990.0     427
1991.0     426
1986.0     391
1972.0     381
1971.0     378
1982.0     368
1985.0     368
1984.0     362
1980.0     361
1981.0     360
1973.0     356
1983.0     353
1970.0     351
1974.0     348
1979.0     338
1977.0     334
1976.0     333
1975.0     332
1978.0     321
2018.0       5
2020.0       1
Name: release_year, dtype: int64

In [12]:
print(movies_year_df["adult"].value_counts())
movies_year_df.drop(["adult", "overview", "poster_path", "tagline"], axis=1, inplace=True)

False    37079
True         8
Name: adult, dtype: int64


In [13]:
pd.set_option("max_rows", None)
movies_year_df["original_language"].value_counts()

en    25951
fr     1945
it     1123
ja     1034
de      904
es      898
ru      663
hi      493
ko      439
zh      393
sv      320
cn      312
pt      291
fi      259
nl      234
da      206
pl      188
tr      146
no      100
fa       98
cs       94
el       88
hu       85
ta       78
th       76
he       65
sr       57
ro       54
te       45
ml       36
ar       36
hr       28
et       24
mr       24
is       24
tl       23
id       19
lv       17
sl       16
bn       13
bs       13
ca       12
xx       12
uk       10
vi       10
ka       10
ab        9
bg        9
ur        7
lt        7
sk        6
nb        6
sq        5
mk        5
sh        4
wo        4
ms        4
kn        3
ky        3
kk        3
bm        3
ku        3
mn        2
eu        2
ps        2
iu        2
lo        2
ne        2
pa        2
af        2
am        2
bo        2
hy        1
zu        1
fy        1
tg        1
jv        1
rw        1
cy        1
uz        1
la        1
sm        1
gl        1
si  

In [14]:
# Keep the top languages and set all others to "other"
orig_lang_cd=movies_year_df["original_language"]

def recode_lang(y):
    if (y in ["en", "fr", "it", "ja", "de", "es", "ru"]):
        return y
    else:
        return "other"
orig_lang_cd=orig_lang_cd.map(recode_lang)
orig_lang_cd.name="orig_lang_cd"
print(f'Original language counts: {orig_lang_cd.value_counts()}')
print(f'movie count original language: {len(orig_lang_cd)}')
print(f'movie count movies_year_df: {len(movies_year_df)}')

# Update original language - default join is outer
movies_year_df= pd.concat([movies_year_df, orig_lang_cd], axis=1, ignore_index=False, join="inner")
print(len(movies_year_df)) 

# drop original column
movies_year_df.drop(["original_language"], axis=1, inplace=True)

Original language counts: en       25951
other     4569
fr        1945
it        1123
ja        1034
de         904
es         898
ru         663
Name: orig_lang_cd, dtype: int64
movie count original language: 37087
movie count movies_year_df: 37087
37087


In [15]:
pd.reset_option("max_rows")
# Convert some variables to binary:
# convert belongs_to_collection, homepage and success to binary(0,1)
print(movies_year_df["belongs_to_collection"].notnull().sum())
collection=movies_year_df["belongs_to_collection"].notnull().replace([True, False], ["yes", "no"])
collection.name="collection"
print(collection.value_counts())

print(movies_year_df["homepage"].notnull().sum())
website=movies_year_df["homepage"].notnull().replace([True, False], ["yes", "no"])
website.name="website"
print(website.value_counts())

success=movies_year_df["vote_average"].apply(lambda z: "yes" if z > 5 else "no")
success.name="success"
print(success.value_counts())

clean_df = pd.concat([movies_year_df, collection, website, success], axis=1, ignore_index=False, join="inner")
clean_df.drop(["belongs_to_collection", "homepage", "vote_average"], axis=1, inplace=True)
print(clean_df.head())


3919
no     33168
yes     3919
Name: collection, dtype: int64
7680
no     29407
yes     7680
Name: website, dtype: int64
yes    27444
no      9643
Name: success, dtype: int64
     budget                                             genres     id  \
0  30000000  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...    862   
1  65000000  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   8844   
2         0  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...  15602   
3  16000000  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...  31357   
4         0                     [{'id': 35, 'name': 'Comedy'}]  11862   

     imdb_id               original_title popularity  \
0  tt0114709                    Toy Story  21.946943   
1  tt0113497                      Jumanji  17.015539   
2  tt0113228             Grumpier Old Men    11.7129   
3  tt0114885            Waiting to Exhale   3.859495   
4  tt0113041  Father of the Bride Part II   8.387519   

                                production_compan

In [16]:
# # drop any null values because they cause the loss function to be null
# print(clean_df.isnull().sum())
# print(clean_df[clean_df["runtime"].isna()])  # 4 rows have NaN runtime
# clean_df.dropna(axis=0, how="any", inplace=True)
# print(clean_df[clean_df["runtime"].isna()])
# print(clean_df.columns)
# print(clean_df.dtypes)
# print(clean_df.head())

In [17]:
# Look at:
# runtime                   object
# budget                    object
# original_title            object
# popularity                object
# poster_path               object
# spoken_languages          object
# status                    object
# title                     object
# video                     object

In [18]:
clean_df.columns

Index(['budget', 'genres', 'id', 'imdb_id', 'original_title', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title', 'video',
       'vote_count', 'release_year', 'orig_lang_cd', 'collection', 'website',
       'success'],
      dtype='object')

In [19]:
pd.reset_option("max_rows")
clean_df_subset = clean_df[["budget", "popularity", "revenue", "runtime", "status", "video"]].copy()
# count nulls
clean_df_subset.isnull().sum()


budget          0
popularity      0
revenue         0
runtime       216
status         78
video           0
dtype: int64

In [20]:
# clean_df_subset.value_counts(dropna=False) known bug, supposed to be fixed in v 1.2 of pandas?
col_list= clean_df_subset.columns
for col in col_list:
    print(f'({col} {clean_df_subset[col].value_counts()}')
# drop video because mostly False

(budget 0           28815
5000000       278
10000000      255
20000000      240
15000000      222
            ...  
114000          1
107             1
74000           1
2073816         1
9500            1
Name: budget, Length: 1014, dtype: int64
(popularity 1e-06       56
0.000308    42
0.0         40
0.00022     39
0.000578    38
            ..
7.435688     1
5.914833     1
0.717502     1
0.546136     1
1.288235     1
Name: popularity, Length: 35819, dtype: int64
(revenue 0.0           30058
12000000.0       13
11000000.0       13
10000000.0       12
500000.0         12
              ...  
33472850.0        1
745327.0          1
40263020.0        1
10600497.0        1
38702310.0        1
Name: revenue, Length: 6653, dtype: int64
(runtime 90.0     2287
0.0      1407
100.0    1269
95.0     1205
93.0     1044
         ... 
313.0       1
289.0       1
566.0       1
370.0       1
501.0       1
Name: runtime, Length: 338, dtype: int64
(status Released           36692
Rumored              1

In [21]:
# Filter on status
clean_df=clean_df[clean_df["status"].notnull()]
print(len(clean_df))
clean_df=clean_df[clean_df["status"].eq("Released")]
print(len(clean_df))

# Cancelled and Planned have low vote counts, release dates 2015+, except for a few in 2012, Cancelled has release year 2003, some have runtimes and some are zero
# In Production all have zero revenue, and budget, # votes < 100, except one with 219, release_year 2009 +, mostly 2015+
# Post Production mostly missing revenue and budget, vote counts < 100, most have release dates 2014+
# Rumored mostly missing budget and revenue, low vote counts, release dates all over the place.
# Null mostly missing revenue and budget, have low vote counts, release years all over.
# Looks like status not equal to released is a pretty good indication the movie hasn't been updated with up-to-date information
# Release year is not reliable because it can be planned or expected and not actual
# 

37009
36692


In [22]:
clean_df_subset = clean_df[["budget", "popularity", "revenue", "runtime"]].copy()
print(clean_df_subset.isnull().sum())
col_list= clean_df_subset.columns
for col in col_list:
    print(f'({col} {clean_df_subset[col].value_counts()}')

budget          0
popularity      0
revenue         0
runtime       211
dtype: int64
(budget 0           28445
5000000       278
10000000      255
20000000      240
15000000      222
            ...  
74000           1
1182273         1
16600000        1
4466000         1
9500            1
Name: budget, Length: 1010, dtype: int64
(popularity 1e-06       56
0.000308    41
0.00022     38
0.0         38
0.000578    37
            ..
0.567032     1
7.435688     1
5.914833     1
0.717502     1
1.288235     1
Name: popularity, Length: 35464, dtype: int64
(revenue 0.0           29675
11000000.0       13
12000000.0       13
500000.0         12
10000000.0       12
              ...  
3558669.0         1
33472850.0        1
745327.0          1
40263020.0        1
38702310.0        1
Name: revenue, Length: 6641, dtype: int64
(runtime 90.0     2264
0.0      1352
100.0    1257
95.0     1193
93.0     1032
         ... 
452.0       1
298.0       1
317.0       1
287.0       1
501.0       1
Name: runti

In [23]:
# Budget: has 28,445 with value 0 out of a total 36,692 movies 76%
# Revenue: has 29,674 with value 0 out of a total 36,692 movies 81%
# Runtime: has 211 missing, 1,352 with value 0 = 4%, convert runtime missing to 0
# Popularity has no missing, 56 effectvely zero but need to find out what this is.

In [24]:
# Transform budget to numeric
clean_df["budget"] = pd.to_numeric(clean_df["budget"], errors="coerce")

In [25]:
print(clean_df.columns)
clean_df.drop(["video", "title", "release_date", "status"], axis=1, inplace=True)
print(clean_df.columns)


Index(['budget', 'genres', 'id', 'imdb_id', 'original_title', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title', 'video',
       'vote_count', 'release_year', 'orig_lang_cd', 'collection', 'website',
       'success'],
      dtype='object')
Index(['budget', 'genres', 'id', 'imdb_id', 'original_title', 'popularity',
       'production_companies', 'production_countries', 'revenue', 'runtime',
       'spoken_languages', 'vote_count', 'release_year', 'orig_lang_cd',
       'collection', 'website', 'success'],
      dtype='object')


In [146]:
# Below is working on reading the strings that look like a list of dictionaries
# example: [{'iso_3166_1': 'DE', 'name': 'Germany'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]
# result wanted: columns: DE, US, which are values of key='name'
# Ultimately need columns labeled DE, US, etc. and each movie has a 0 1 indicator for the column
# method extractall keeps the series index and the level one index and a second index called match is created which corrresponds 
# to the order in which the regex capture groups were found

import re
prod_ctry = clean_df["production_countries"]

# extract country code in a series with a multi-level index 
form = r"\{'iso_3166_1':\s'(?P<prd_cty>\w{2})"
prod_ctry_cds=prod_ctry.str.extractall(form)

# print data about the extract
# print(f'number of unique country codes: {prod_ctry_cds["prd_cty"].nunique()}')
# print(f'print list of unique country codes: {prod_ctry_cds["prd_cty"].unique()}')
# print(f'print counts of each value: {prod_ctry_cds["prd_cty"].value_counts()}')
# counts=prod_ctry_cds["prd_cty"].value_counts()

# create empty data frame with index from clean_data and columns = unique values of country code
# pre-fill with zeroes
new_cols = prod_ctry_cds["prd_cty"].unique() 

result_df = pd.DataFrame(0, index=clean_df.index.copy(), columns=new_cols)
# print(result_df.head())

# iterate through index with iterrows()
# iterrows returns a tuple with row index as a tuple and row data as a series
# to limit number of rows use: for index, row in prod_ctry_cds.head().iterrows():
for index, row  in prod_ctry_cds.iterrows():
#     print(f'index: {index}, type: {type(index)}')
#     print(f'row: {row}, type: {type(row)}')
    # use column names to access the value in each row
#     print(f'original series index: {index[0]}')
#     print(f'production country: {row["prd_cty"]}')
    # write to the initialized dataframe
    result_df.loc[index[0], row["prd_cty"]] = 1

pd.set_option("max_columns", None)
print(result_df.head(10))
number_countries=result_df.sum(axis=1, min_count=1)
print(number_countries.head(10))
print(f'frequencies of number of countries {number_countries.value_counts()}')
# counts=result_df.value_counts()
# print(counts.head(10))
# print(f'number of production countries per movie: {result_df.sum(axis=0)}')

    
    







   US  DE  GB  FR  IT  ES  CN  AU  ZA  CA  CH  BE  JP  IR  NL  HK  TN  IE  DO  \
0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
2   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
3   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
4   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
5   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
6   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
7   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
8   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
9   1   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   

   HR  RU  MK  AT  TW  NZ  MX  PL  PE  CU  LI  DK  PT  FI  SE  AR  IS  KR  RS  \
0   0   0   0   0   0   0  

In [79]:
print(type(prod_ctry_cds))
print(prod_ctry_cds.index)     

<class 'pandas.core.frame.DataFrame'>
MultiIndex([(    0, 0),
            (    1, 0),
            (    2, 0),
            (    3, 0),
            (    4, 0),
            (    5, 0),
            (    6, 0),
            (    6, 1),
            (    7, 0),
            (    8, 0),
            ...
            (45454, 0),
            (45458, 0),
            (45459, 0),
            (45460, 0),
            (45460, 1),
            (45460, 2),
            (45460, 3),
            (45462, 0),
            (45463, 0),
            (45465, 0)],
           names=[None, 'match'], length=40638)


In [44]:
# pd.set_option("max_colwidth", None)
# test_df=clean_df.copy()
# def convert_to_json(str0):
#     str1=str(str0).replace("'", '"')
#     str2='{"production_countries": ' + str1 +'}'
#     return str2
# test_df["test_input"]=test_df["production_countries"].map(convert_to_json)
# test_input=pd.DataFrame(test_df["production_countries"].map(read_json))
# print(test_df.head())
# test_input=test_df["test_input"]  
# test_series=test_df["test_input"].apply(json.loads)
# print(test_series[6])

# stdf = df['stats'].apply(json.loads)
# stlst = list(stdf)
# stjson = json.dumps(stlst)
# df.join(pandas.read_json(stjson))

# stdf = df['stats'].apply(json.loads)
# pd.DataFrame(stdf.tolist()) # or stdf.apply(pd.Series)
# or alternatively in one step:
# df.join(df['stats'].apply(json.loads).apply(pd.Series))

# res=test_df["ctry_json"].apply(json.loads)
# test_df["ctry_result"]=test_df["ctry_json"].apply(json.loads)
# print(type(test_df["ctry_result"][0]))
# print(test_df["ctry_result"][0])

In [None]:
# import re
# pd.set_option("max_columns", None)
# production_companies is a string that looks like a list of dictionaries
# format [{},{}] so a list of dictionaries
# example: [{'iso_3166_1': 'DE', 'name': 'Germany'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]
# this is not valid JSON code because JSON expects only double quotes
# can't get ast.literal_eval to work, it chokes on lists
# try parsing with regex capture groups in pandas
# [{'iso_3166_1': 'DE', 'name': 'Germany'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]

# pd_countries=movies_year_df["production_countries"]
# print(pd_countries[6])
# form1=r'(\{.*}\})'



# test_movies["production_countries"] = test_movies["production_countries"].fillna('[]').apply(ast.literal_eval)
# # cannot iterate over a list because it is not hashable, use list comprehension?
# test_movies["production_countries"]=test_movies["production_countries"].apply(lambda x: np.nan if pd.isnull(x) else ast.literal_eval(x))
# print(test_movies[6])

# test_movies["production_countries"] = test_movies["production_countries"].apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else[])
# print(type(test_movies["production_companies"][1]))
# print(test_movies.head())
# print(pd_countries[0])
# print(isinstance(pd_countries[10],list))

# def recode_cty(y):
#     if isinstance(y,list):
#         return y
#     else:
#         return []

# pd_countries=pd_countries.map(recode_cty)
# pd_countries.value_counts()

# pd_countries = pd_countries.apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
# df["production_countries"].head()

# pd_countries.name="production_countries"
# code from https://www.kaggle.com/danofer/movies-data-clean

# df['production_countries'] = df['production_countries'].fillna('[]').apply(ast.literal_eval)
# df['production_countries'] = df['production_countries'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
# pd.reset_option("max_rows")
# pd.set_option("max_colwidth", None)
# import json
# import re

# production_companies is a string that looks like a list of dictionaries
# format [{},{}] so a list of dictionaries
# example: [{'iso_3166_1': 'DE', 'name': 'Germany'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]
# this is not valid JSON code because JSON expects only double quotes
# for converesion see https://stackoverflow.com/questions/55600788/replace-single-quotes-with-double-quotes-but-leave-ones-within-double-quotes-unt/63862387#63862387

# pd_countries=df["production_countries"]
# print(pd_countries.head())
# # print(pd_countries.value_counts())

# identify non-string values, nonstring=False
# orig_ctry_num=pd_countries[pd.to_numeric(pd_countries,errors='coerce').isnull()]
# pd_countries.head()
# rel_year["release_year"]=pd.to_numeric(rel_year[0])
# temp=orig_lang_cd[orig_lang_cd] in ["en", "fr", "it", "ja", "de", "es", "ru"]
# temp.head()

# def singleQuoteToDoubleQuote(singleQuoted):
#     '''
#     convert a single quoted string to a double quoted one
#     Args:
#         singleQuoted(string): a single quoted string e.g. {'cities': [{'name': "Upper Hell's Gate"}]}
#     Returns:
#         string: the double quoted version of the string e.g. 
#     see
#         - https://stackoverflow.com/questions/55600788/python-replace-single-quotes-with-double-quotes-but-leave-ones-within-double-q 
#     '''
#     cList=list(singleQuoted)
#     inDouble=False;
#     inSingle=False;
#     for i,c in enumerate(cList):
#         #print ("%d:%s %r %r" %(i,c,inSingle,inDouble))
#         if c=="'":
#             if not inDouble:
#                 inSingle=not inSingle
#                 cList[i]='"'
#         elif c=='"':
#             inDouble=not inDouble
#     doubleQuoted="".join(cList)    
#     return doubleQuoted


# pd_countries=pd_countries.map(singleQuoteToDoubleQuote)
# print(pd_countries_df.head())

# pd_countries=df["production_countries"].apply(json.loads)
# pd_countries.apply(pd.Series)
# pd_countries.head()

# one: step
# df.join(df['stats'].apply(json.loads).apply(pd.Series))







In [None]:
# # production_companies is a string that looks like a list of dictionaries
# # filter rows to keep only those with 'iso_3166_1' = 'US'

        
# pd_countries=df["production_countries"]  # pd_countries is a series
# form1=r"'iso_3166_1':\s*('US')"
# country = pd_countries.str.extract(f'({form1})')
# country.drop([0],axis=1,inplace=True)
# print(country.notnull().sum())
# country.dropna(how="any", inplace=True)
# len(country)


In [None]:
# Below is preprocessing for the model



In [None]:
clean_cat=clean_df.dtypes[clean_df.dtypes == "object"].index.tolist()
print(clean_cat)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Create a OneHotEncoder instance
enc =OneHotEncoder(sparse=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df =pd.DataFrame(enc.fit_transform(clean_df[clean_cat]))
# Add the encoded variable names to the DataFrame
encode_df.columns =enc.get_feature_names(clean_cat)
encode_df.head()

In [None]:
# Merge one-hot encoded features and drop the originals
clean_df =clean_df.merge(encode_df,left_index=True,right_index=True)
print(clean_df.head())
print(type(clean_df))

In [None]:
clean_df.drop(clean_cat,axis=1, inplace=True)
print(clean_df)

In [None]:
movies_year_df = pd.concat([df, rel_year, country], axis=1, ignore_index=False, join="inner")
len(movies_year_df)

In [None]:
movies_year_df.columns

In [None]:
movies_year_df["release_year"].value_counts()

In [None]:
# Save to csv
file_path ="Resources/segment1_input.csv"
movies_year_df.to_csv(file_path,index=False)
