In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import networkx as nx
import matplotlib.colors as mcolors
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_data(file_path, sep='\t', header_names=None, encoding='latin-1'):
    """
    Load a delimited text file into a DataFrame.

    :param file_path (str): The path to the delimited file containing the data.
    :param sep (str): Field delimiter; defaults to a tab.
    :param header_names (list[str] | None): Column names to assign, passed to
        pandas as ``names``. When ``None``, pandas infers the header from the file.
    :param encoding (str): Text encoding used to decode the file. Defaults to
        ``'latin-1'``, the value this loader previously hard-coded.

    :return pandas.DataFrame: The loaded dataset.
    """
    return pd.read_table(file_path, sep=sep, names=header_names, encoding=encoding)

### User Ratings Data

In [3]:
# Column names assigned to the ratings file, in file order.
user_data_columns = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]

In [4]:
# Read the ratings file with the loader's default tab separator; column names
# are supplied explicitly.
user_data = load_data(
    file_path='../data/raw/ml-100k/u.data',
    header_names=user_data_columns,
)
user_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### Item (Movie) Data

In [5]:
# Column names for the movie file: five descriptive fields followed by one
# indicator column per genre.
u_item_columns = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film_Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci_Fi', 'Thriller', 'War', 'Western',
]

In [6]:
# The movie file is pipe-delimited; column names are supplied explicitly.
u_item = load_data(
    file_path='../data/raw/ml-100k/u.item',
    sep='|',
    header_names=u_item_columns,
)
u_item.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### User Info

In [7]:
# Column names assigned to the user demographics file, in file order.
u_user_columns = [
    'user_id',
    'age',
    'gender',
    'occupation',
    'zip_code',
]

In [8]:
# The user file is pipe-delimited; column names are supplied explicitly.
u_user = load_data(
    file_path='../data/raw/ml-100k/u.user',
    sep='|',
    header_names=u_user_columns,
)
u_user.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Combined User Data

In [9]:
# Attach movie metadata to every rating, then attach the rater's demographics.
# Join keys are spelled out rather than left to pandas' shared-column
# inference, and `validate` raises if a movie_id or user_id is duplicated on
# the lookup side, so a bad key cannot silently multiply rating rows.
all_data = (
    user_data
    .merge(u_item, left_on='item_id', right_on='movie_id', validate='many_to_one')
    .merge(u_user, on='user_id', validate='many_to_one')
)

In [10]:
# Summary statistics for the merged frame. The output reports a count of 0 for
# video_release_date, i.e. that column holds no values.
all_data.describe()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,video_release_date,unknown,Action,Adventure,Animation,...,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western,age
count,100000.0,100000.0,100000.0,100000.0,100000.0,0.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0,425.53013,,0.0001,0.25589,0.13753,0.03605,...,0.01733,0.05317,0.04954,0.05245,0.19461,0.1273,0.21872,0.09398,0.01854,32.96985
std,266.61442,330.798356,1.125674,5343856.0,330.798356,,0.01,0.436362,0.344408,0.186416,...,0.130498,0.224373,0.216994,0.222934,0.395902,0.33331,0.41338,0.291802,0.134894,11.562623
min,1.0,1.0,1.0,874724700.0,1.0,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
25%,254.0,175.0,3.0,879448700.0,175.0,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0
50%,447.0,322.0,4.0,882826900.0,322.0,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
75%,682.0,631.0,4.0,888260000.0,631.0,,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0
max,943.0,1682.0,5.0,893286600.0,1682.0,,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,73.0


In [11]:
# Order the ratings by user; original row labels are kept.
all_data = all_data.sort_values(by='user_id')

In [12]:
# Remove the video_release_date column. errors='ignore' keeps this cell safe
# to re-run: a second execution is a no-op instead of raising KeyError because
# the column is already gone.
all_data = all_data.drop(columns=['video_release_date'], errors='ignore')

### Save Data

In [13]:
# Write the merged, user-sorted frame to the interim folder; index=False
# leaves the row labels out of the file.
all_data.to_csv('../data/interim/all_data.csv', index=False)

In [14]:
# Preview of the saved frame. The row labels shown are not 0..4 because
# sorting by user_id kept each row's original label.
all_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,movie_title,release_date,IMDb_URL,unknown,Action,...,Mystery,Romance,Sci_Fi,Thriller,War,Western,age,gender,occupation,zip_code
1722,1,124,5,875071484,124,Lone Star (1996),21-Jun-1996,http://us.imdb.com/M/title-exact?Lone%20Star%2...,0,0,...,1,0,0,0,0,0,24,M,technician,85711
1643,1,161,4,875072303,161,Top Gun (1986),01-Jan-1986,http://us.imdb.com/M/title-exact?Top%20Gun%20(...,0,1,...,0,1,0,0,0,0,24,M,technician,85711
1642,1,147,3,875240993,147,"Long Kiss Goodnight, The (1996)",05-Oct-1996,http://us.imdb.com/M/title-exact?Long%20Kiss%2...,0,1,...,0,0,0,1,0,0,24,M,technician,85711
1641,1,49,3,878542478,49,I.Q. (1994),01-Jan-1994,http://us.imdb.com/M/title-exact?I.Q.%20(1994),0,0,...,0,1,0,0,0,0,24,M,technician,85711
1640,1,128,4,875072573,128,Supercop (1992),26-Jul-1996,http://us.imdb.com/M/title-exact?Police%20Stor...,0,1,...,0,0,0,1,0,0,24,M,technician,85711


In [15]:
# The split files are read with the same column names as the ratings file.
header_names = user_data_columns

# Source and destination folders. The trailing slash matters: file names are
# appended to these strings directly.
base_path = '../data/raw/ml-100k/'
base_save_path = '../data/interim/'

# File-name stems of the train/test splits: u1..u5, then ua and ub.
file_prefixes = [f'u{n}' for n in range(1, 6)] + ['ua', 'ub']

In [16]:
for prefix in tqdm(file_prefixes, desc='Files', total=len(file_prefixes)):
    # Each split is a pair of raw files sharing a stem: <prefix>.base / <prefix>.test
    raw_stem = base_path + prefix
    train_file, test_file = raw_stem + '.base', raw_stem + '.test'

    # Read both halves of the split before anything is written
    train_data, test_data = (
        load_data(path, header_names=header_names)
        for path in (train_file, test_file)
    )

    # Store them as CSV next to the other interim files, without row labels
    out_stem = base_save_path + prefix
    train_data.to_csv(out_stem + '_base.csv', index=False)
    test_data.to_csv(out_stem + '_test.csv', index=False)

Files:   0%|          | 0/7 [00:00<?, ?it/s]

Files: 100%|██████████| 7/7 [00:01<00:00,  4.93it/s]


In [17]:
# Load the pipe-delimited genre list.
# NOTE(review): in the preview the second column runs 0, 1, 2, ... in step with
# the genre rows, which looks like a genre index rather than a count - confirm
# before relying on the column name 'count'.
data_for_analitic_genre = load_data('../data/raw/ml-100k/u.genre', sep='|', header_names=['genre', 'count'])

In [18]:
# Preview the first rows of the genre table.
data_for_analitic_genre.head()

Unnamed: 0,genre,count
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


In [19]:
# Write the genre table to the interim folder without row labels.
data_for_analitic_genre.to_csv('../data/interim/data_for_analitic_genre.csv', index=False)

#### Additional Data

In [20]:
# Column names for u.info: a number followed by the label it belongs to.
headers = ['number', 'names']

In [21]:
# u.info is read with a single space as the field separator.
additional_info = load_data('../data/raw/ml-100k/u.info', sep=' ', header_names=headers)

In [22]:
# The table has only three rows (users, items, ratings), so head() shows all of it.
additional_info.head()

Unnamed: 0,number,names
0,943,users
1,1682,items
2,100000,ratings


In [23]:
# Write the dataset size summary to the interim folder without row labels.
additional_info.to_csv('../data/interim/additional_info.csv', index=False)

#### Movies with genres

In [24]:
# Reads the same file with the same column names as u_item. `movies` is a
# separate frame, so the in-place edits applied to it below do not touch u_item.
movies = load_data('../data/raw/ml-100k/u.item', sep='|', header_names=u_item_columns)

In [25]:
# Preview the movie frame while it still has one indicator column per genre.
movies.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [26]:
# Genre indicator columns to collapse. Names appear in the combined string in
# the order of this list; note that Comedy, Crime, Documentary and Drama come
# last here, unlike their position in u_item_columns.
genre_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens',
    'Fantasy', 'Film_Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western',
    'Comedy', 'Crime', 'Documentary', 'Drama',
]


def combine_genres(row):
    """Return the names of all genres whose value in `row` equals 1, joined by ', '."""
    active = []
    for name in genre_columns:
        if row[name] == 1:
            active.append(name)
    return ', '.join(active)

# Build one comma-separated genre string per movie (row-wise apply)
movies['genres_combined'] = movies.apply(combine_genres, axis=1)

# The per-genre indicators and the date/URL fields are no longer needed
columns_to_remove = genre_columns + ['release_date', 'video_release_date', 'IMDb_URL']
movies = movies.drop(columns=columns_to_remove)

In [27]:
# Confirm the frame now holds only movie_id, movie_title and genres_combined.
movies.head()

Unnamed: 0,movie_id,movie_title,genres_combined
0,1,Toy Story (1995),"Animation, Childrens, Comedy"
1,2,GoldenEye (1995),"Action, Adventure, Thriller"
2,3,Four Rooms (1995),Thriller
3,4,Get Shorty (1995),"Action, Comedy, Drama"
4,5,Copycat (1995),"Thriller, Crime, Drama"


In [28]:
# Write the movie/genre table to the interim folder without row labels.
movies.to_csv('../data/interim/movies.csv', index=False)