# Data processing

In [92]:
import pandas as pd
import os

# Project root is the parent of the notebook's working directory; the csv,
# src and json folders are resolved directly underneath it.
# os.path.join uses the platform's separator instead of a hard-coded '\\',
# so the folders resolve on Windows (same string as before) and elsewhere.
base_path : str = os.path.dirname(os.getcwd())
csv_path  : str = os.path.join(base_path, 'csv')
src_path  : str = os.path.join(base_path, 'src')
json_path : str = os.path.join(base_path, 'json')

## Loading data

In [52]:
# movie.csv is ';'-separated and is read without a header row, so the column
# names are assigned explicitly (the assignment fails loudly if the file does
# not have exactly three columns).
# The file name is joined with os.path.join rather than a literal '\\'.
movies = pd.read_csv(os.path.join(csv_path, "movie.csv"), sep=';', header=None)
movies.columns = ['MOVIE_ID', 'TMDB_ID', 'TITLE']
movies

Unnamed: 0,MOVIE_ID,TMDB_ID,TITLE
0,1,389,12 Angry Men
1,2,62,2001: A Space Odyssey
2,3,20453,3 Idiots
3,4,453,A Beautiful Mind
4,5,185,A Clockwork Orange
...,...,...,...
195,196,8392,Tonari no Totoro
196,197,1480,Touch of Evil
197,198,862,Toy Story
198,199,627,Trainspotting


In [3]:
# Ratings file: ';'-separated, read without a header row; one row per
# (user, movie) rating.
# Read from the shared csv_path instead of a machine-specific absolute path,
# so the notebook runs on any checkout and no local user name is embedded.
# NOTE(review): assumes train.csv sits in the same csv folder as movie.csv — confirm.
data = pd.read_csv(os.path.join(csv_path, "train.csv"), sep=';', header=None)
data.columns = ['ID', 'USER_ID', 'MOVIE_ID', 'RATING']
data

Unnamed: 0,ID,USER_ID,MOVIE_ID,RATING
0,1,1642,137,4
1,2,1642,1,5
2,3,1642,136,5
3,4,1642,2,3
4,5,1642,139,5
...,...,...,...,...
32215,32216,1455,126,3
32216,32217,1455,124,3
32217,32218,1455,123,1
32218,32219,1455,122,4


In [101]:
# Rating matrix with one row per MOVIE_ID and one column per USER_ID.
# Cells without a rating are filled with the sentinel -1.
pivot_data = (
    data
    .pivot_table(index='MOVIE_ID', columns='USER_ID', values='RATING')
    .fillna(-1)
)
pivot_data

USER_ID,5,12,19,24,31,52,62,63,68,69,...,1767,1769,1779,1781,1785,1796,1804,1813,1815,1816
MOVIE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.0,-1.0,5.0,-1.0,-1.0,3.0,1.0,-1.0,5.0,1.0,...,-1.0,-1.0,3.0,-1.0,-1.0,3.0,-1.0,3.0,-1.0,2.0
2,2.0,-1.0,2.0,5.0,0.0,4.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,5.0,-1.0,1.0,3.0,5.0,-1.0,-1.0
3,-1.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,5.0,2.0,-1.0,2.0,5.0,4.0,2.0
4,-1.0,5.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,0.0,...,-1.0,3.0,-1.0,5.0,-1.0,3.0,4.0,-1.0,3.0,4.0
5,2.0,-1.0,-1.0,5.0,-1.0,-1.0,-1.0,-1.0,5.0,-1.0,...,3.0,4.0,4.0,5.0,3.0,-1.0,-1.0,3.0,2.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,4.0,2.0,4.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,...,3.0,-1.0,-1.0,2.0,3.0,-1.0,-1.0,-1.0,-1.0,4.0
197,4.0,-1.0,5.0,3.0,-1.0,3.0,3.0,5.0,-1.0,0.0,...,-1.0,-1.0,-1.0,-1.0,2.0,-1.0,-1.0,4.0,-1.0,3.0
198,-1.0,-1.0,-1.0,2.0,4.0,-1.0,1.0,-1.0,-1.0,-1.0,...,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,3.0,3.0,-1.0,2.0
199,-1.0,2.0,5.0,-1.0,1.0,-1.0,0.0,5.0,-1.0,0.0,...,4.0,4.0,3.0,1.0,-1.0,-1.0,4.0,-1.0,-1.0,-1.0


In [93]:
# Persist the raw ratings frame as JSON. The target folder is created first so
# the write does not fail on a fresh checkout where it does not exist yet.
os.makedirs(json_path, exist_ok=True)
data.to_json(os.path.join(json_path, 'data.json'))

In [143]:
# Write two JSON files per user: the movies they rated and the movies they
# did not rate (cells holding the -1 sentinel in pivot_data).
os.makedirs(json_path, exist_ok=True)

for user_id in pivot_data.columns:
    user_column = pivot_data[user_id]
    is_rated = user_column != -1

    rated = user_column[is_rated]
    not_rated = user_column[~is_rated]

    # MOVIE_IDS are taken from the pivot's index labels (the real MOVIE_ID
    # values). Enumerating the column would store 0-based row positions
    # instead, which are off by one for 1-based IDs and wrong whenever the
    # IDs are not contiguous.
    ratings_for_user = pd.DataFrame({
        'USER_ID': user_id,
        'RATINGS': rated.tolist(),
        'MOVIE_IDS': rated.index.tolist()
    })
    non_ratings_for_user = pd.DataFrame({
        'USER_ID': user_id,
        'RATINGS': not_rated.tolist(),
        'MOVIE_IDS': not_rated.index.tolist()
    })

    ratings_for_user.to_json(os.path.join(json_path, f"{user_id}_RATINGS.json"))
    non_ratings_for_user.to_json(os.path.join(json_path, f"{user_id}_NAN_RATINGS.json"))


## Connecting to TMDB

In [2]:
def download_movie_data_From_TMDB(movie_id: int, timeout: float = 10.0) -> "list | None":
    """Fetch a movie's details and credits from the TMDB v3 REST API.

    Two GET requests are made: ``/3/movie/{movie_id}`` and, only if that one
    succeeds, ``/3/movie/{movie_id}/credits``. The API key is read from the
    ``TMDB_API_KEY`` environment variable.

    Args:
        movie_id: TMDB identifier of the movie.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        ``[movie_data, credits_data]`` (decoded JSON of both responses) on
        success; ``[movie_data, None]`` when only the credits request failed;
        ``None`` when the details request failed or no API key is configured.
        Callers that unpack the result must check for ``None`` first.

    Raises:
        Any exception raised by ``requests.get`` (connection error, timeout)
        is propagated unchanged.
    """
    import requests
    import os

    API_KEY = os.getenv("TMDB_API_KEY")
    if not API_KEY:
        # Without a key the request cannot succeed, so skip the round-trip.
        print(f"TMDB_API_KEY is not set; skipping download for {movie_id}")
        return None

    url = f'https://api.themoviedb.org/3/movie/{movie_id}'
    other_url = f'https://api.themoviedb.org/3/movie/{movie_id}/credits'

    params = {
        'api_key': API_KEY,
        # TMDB documents language as ISO 639-1 with an optional region
        # (e.g. 'en-US'); the three-letter 'eng' is not such a code.
        'language': 'en-US'
    }

    # An explicit timeout keeps a stalled connection from hanging the
    # notebook indefinitely.
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        print(f"Błąd dla {movie_id}: {response.status_code} - {response.text}")
        return None
    movie_data = response.json()

    response = requests.get(other_url, params=params, timeout=timeout)
    if response.status_code != 200:
        print(f"Błąd w other data dla {movie_id}")
        return [movie_data, None]

    return [movie_data, response.json()]

    

In [149]:
# Build one metadata record per movie by combining the local movie list with
# the details and credits returned by TMDB.

# How many of the most popular cast members are kept per movie.
TOP_ACTORS = 10

movie_data = []

# Iterate over the rows that actually exist in `movies` rather than a
# hard-coded range(200), which overruns a shorter frame and raises KeyError.
for movie in movies.itertuples(index=False):

    result = download_movie_data_From_TMDB(movie_id=movie.TMDB_ID)
    if result is None:
        # The downloader has already printed the failure; skip this movie
        # instead of crashing on the tuple unpacking below.
        continue
    output, other_data = result

    # The credits request can fail independently, leaving other_data as None.
    credits_data = other_data or {}

    # Cast ordered by descending popularity, truncated to the top entries.
    actors_list = sorted(
        credits_data.get('cast', []),
        key=lambda actor: actor['popularity'],
        reverse=True,
    )[:TOP_ACTORS]
    actors_list = [actor['original_name'] for actor in actors_list]

    movie_info = {
        # Take the ID from the frame itself instead of deriving it from the
        # row position.
        'MOVIE_ID': movie.MOVIE_ID,
        'TITLE': movie.TITLE,
        'TMDB_ID': movie.TMDB_ID,
        'title': output['title'],
        'director': [person['original_name'] for person in credits_data.get('crew', []) if person['job'] == 'Director'],
        'production_companies': [company['name'] for company in output['production_companies']],
        'genres': [genre['name'] for genre in output['genres']],
        'popularity': output['popularity'],
        'rating': output['vote_average'],
        'actors': actors_list,
        'overview': output['overview']
    }

    movie_data.append(movie_info)

movie_Df = pd.DataFrame(movie_data)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Michał\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Michał\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Michał/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\Michał\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [129]:
# Display the per-movie metadata table assembled from the TMDB responses.
movie_Df

Unnamed: 0,MOVIE_ID,TITLE,TMDB_ID,title,director,production_companies,genres,popularity,rating,actors,overview
0,1,12 Angry Men,389,12 Angry Men,[Sidney Lumet],"[United Artists, Orion-Nova Productions]",[Drama],70.722,8.545,"[Lee J. Cobb, Henry Fonda, Martin Balsam, Jack...",The defense and the prosecution have rested an...
1,2,2001: A Space Odyssey,62,2001: A Space Odyssey,[Stanley Kubrick],"[Kubrick Productions, Metro-Goldwyn-Mayer]","[Science Fiction, Mystery, Adventure]",77.925,8.073,"[Stanley Kubrick, Keir Dullea, Gary Lockwood, ...",Humanity finds a mysterious object buried bene...
2,3,3 Idiots,20453,3 Idiots,[Rajkumar Hirani],"[Vidhu Vinod Chopra Productions, Vinod Chopra ...","[Drama, Comedy]",30.586,8.000,"[Kareena Kapoor Khan, Aamir Khan, Ali Fazal, J...",Rascal. Joker. Dreamer. Genius... You've never...
3,4,A Beautiful Mind,453,A Beautiful Mind,[Ron Howard],"[Universal Pictures, DreamWorks Pictures, Imag...","[Drama, Romance]",49.376,7.900,"[Russell Crowe, Jennifer Connelly, Bryce Dalla...","Brilliant mathematician, John Nash, is on the ..."
4,5,A Clockwork Orange,185,A Clockwork Orange,[Stanley Kubrick],"[Warner Bros. Pictures, Hawk Films, Kubrick Pr...","[Science Fiction, Crime]",63.517,8.200,"[Malcolm McDowell, Gillian Hills, Adrienne Cor...","In a near-future Britain, young Alexander DeLa..."
...,...,...,...,...,...,...,...,...,...,...,...
195,196,Tonari no Totoro,8392,My Neighbor Totoro,[宮崎駿],"[Studio Ghibli, Nibariki, Tokuma Shoten]","[Fantasy, Animation, Family]",75.908,8.100,"[中村大樹, 大谷育江, 千葉繁, 坂本千夏, 日髙のり子, 島本須美, 平松晶子, 龍田直...",Two sisters move to the country with their fat...
196,197,Touch of Evil,1480,Touch of Evil,[Orson Welles],[Universal International Pictures],"[Crime, Thriller, Mystery]",27.881,7.757,"[Orson Welles, Charlton Heston, Dennis Weaver,...",When a car bomb explodes on the American side ...
197,198,Toy Story,862,Toy Story,[John Lasseter],[Pixar],"[Animation, Adventure, Family, Comedy]",122.612,8.000,"[Tom Hanks, Annie Potts, Wallace Shawn, Tim Al...","Led by Woody, Andy's toys live happily in his ..."
198,199,Trainspotting,627,Trainspotting,[Danny Boyle],"[Figment Films, Film4 Productions, The Noel Ga...","[Drama, Crime]",41.585,8.000,"[Ewan McGregor, Kelly Macdonald, Kevin McKidd,...",Heroin addict Mark Renton stumbles through bad...


In [130]:
# Save the enriched movie table into the csv folder. The file name is joined
# with os.path.join rather than a literal '\\'. The DataFrame index is still
# written as the first, unnamed column, exactly as before.
movie_Df.to_csv(os.path.join(csv_path, "MOVIES_DATA.csv"))

In [15]:
# Experiment: extract TextRank keywords from the overview of TMDB movie 333.
result = download_movie_data_From_TMDB(movie_id=333)
if result is None:
    # The downloader returns None on failure; unpacking it would raise an
    # unhelpful "cannot unpack non-iterable NoneType" error.
    raise RuntimeError("Could not download TMDB data for movie 333")
out1, out2 = result
text = out1['overview']

try:
    from gensim.summarization import keywords
except ModuleNotFoundError as exc:
    # The summarization module was removed in gensim 4.0, so this import only
    # works with an older gensim. Re-raise with an actionable message.
    raise ModuleNotFoundError(
        "gensim.summarization is not available in the installed gensim "
        "(the module was removed in gensim 4.0); this cell needs gensim<4",
        name=exc.name,
    ) from exc

# Extract keywords using TextRank
text_keywords = keywords(text, words=5, split=True, lemmatize=True)

print("Top Keywords using TextRank:", text_keywords)

ModuleNotFoundError: No module named 'gensim.summarization'