In [2]:
import warnings

import pandas as pd

def create_dataframe_from_file(filepath):
    """Parse a ratings file grouped by movie headers into a flat DataFrame.

    The file is read line by line. A line ending in ``:`` is treated as a
    header whose text before the colon is the movie ID; every following line
    of the form ``customer,rating,date`` becomes one row attributed to the
    most recent header.

    Parameters
    ----------
    filepath : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``userId``, ``itemId``, ``rating`` (converted with
        ``pd.to_numeric``) and ``timestamp`` (converted with
        ``pd.to_datetime``), in that order. A file with no usable rows
        yields an empty frame with these four columns.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        Propagated from ``pd.to_numeric`` / ``pd.to_datetime`` when a
        three-field row holds a value that cannot be converted.

    Warns
    -----
    UserWarning
        Once, after reading, if any non-blank line was dropped because it did
        not split into exactly three comma-separated fields or because it
        appeared before a usable movie header.
    """
    columns = ['userId', 'itemId', 'rating', 'timestamp']
    records = []
    current_movie_id = None
    skipped = 0

    with open(filepath, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no data; ignore them without counting.
                continue

            # Check if the line is a movie ID header
            if line.endswith(':'):
                current_movie_id = line[:-1]
                continue

            fields = line.split(',')
            # Rows need an active (non-empty) movie ID and exactly three fields.
            if not current_movie_id or len(fields) != 3:
                skipped += 1
                continue

            customer_id, rating, date = fields
            records.append((customer_id, current_movie_id, rating, date))

    if skipped:
        # Surface dropped lines instead of discarding them silently.
        warnings.warn(
            f"Skipped {skipped} line(s) in {filepath!r} that were not in "
            "'customer,rating,date' format or had no movie ID header",
            stacklevel=2,
        )

    # Passing `columns` explicitly fixes the column order and keeps the
    # conversions below valid even when no rows were collected.
    df = pd.DataFrame(records, columns=columns)

    # Convert columns to the correct data types
    df['userId'] = pd.to_numeric(df['userId'])
    df['itemId'] = pd.to_numeric(df['itemId'])
    df['rating'] = pd.to_numeric(df['rating'])
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    return df


In [None]:
# Parse the raw ratings text file, then write the flat table out as CSV
# (without the DataFrame index).
raw_ratings_path = "../data/netflix-prize/combined_data_1.txt"
ratings_csv_path = "../data/netflix/ratings.csv"

ratings_df = create_dataframe_from_file(raw_ratings_path)
ratings_df.to_csv(ratings_csv_path, index=False)

In [4]:
# Number of distinct user IDs in the parsed ratings
# (dropna=False counts a missing value as one entry, exactly as unique() would).
ratings_df['userId'].nunique(dropna=False)

470758

In [5]:
# Number of distinct item (movie) IDs in the parsed ratings
# (dropna=False counts a missing value as one entry, exactly as unique() would).
ratings_df['itemId'].nunique(dropna=False)

4499