In [5]:
import json

cleaned_ratings = {}

with open("indian movie data/ratings.json", "r", encoding="utf-8") as f:
    for line in f:
        try:
            record = json.loads(line.strip())
            for user_id, ratings in record.items():
                if user_id.lower().startswith("submit") or not isinstance(ratings, dict):
                    continue

                valid_ratings = {}
                for movie_id, rating in ratings.items():
                    try:
                        # Only convert if rating is str/int/float (not list/dict/etc.)
                        if isinstance(rating, (int, float, str)) and not isinstance(rating, bool):
                            valid_ratings[movie_id] = int(float(rating))  # convert string like "4" to int
                    except:
                        continue

                if valid_ratings:
                    cleaned_ratings[user_id] = valid_ratings

        except json.JSONDecodeError:
            continue

# Save the cleaned JSON
with open("indian movie data/cleaned_ratings.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_ratings, f, indent=4)

print("✅ Cleaned and saved to 'cleaned_ratings.json'")


✅ Cleaned and saved to 'cleaned_ratings.json'


In [11]:
# Step 1: Load JSON Lines correctly
import json
import pandas as pd

movies = []
with open("indian movie data/movies.json", "r", encoding="utf-8") as f:
    for line in f:
        try:
            movie = json.loads(line)
            movies.append(movie)
        except json.JSONDecodeError:
            continue

# Step 2: Convert to DataFrame
movies_df = pd.DataFrame(movies)

# Step 3: Check what keys/columns are present
print("Available columns:", movies_df.columns.tolist())
print(movies_df.head(3))
# Save the cleaned movies DataFrame as JSON
movies_df.to_json("cleaned_movies.json", orient="records", lines=True, force_ascii=False)
print("✅ Saved cleaned_movies.json successfully!")



Available columns: ['_id', 'rating', 'movie_id', 'description', 'language', 'released', 'rating_cnt', 'writer', 'director', 'cast', 'img_src', 'genre', 'inserted', 'name']
                                    _id  rating   movie_id  \
0  {'$oid': '581994c40ad09b0701a0e727'}     7.9  tt5286444   
1  {'$oid': '581994c40ad09b0701a0e728'}     7.9  tt4434004   
2  {'$oid': '581994c50ad09b0701a0e729'}     7.5  tt0248126   

                                         description                language  \
0  Neerja is the story of the courageous Neerja B...                 [Hindi]   
1  A story that revolves around drug abuse in the...        [Hindi, Panjabi]   
2  Yashvardhan Raichand lives a very wealthy life...  [Hindi, English, Urdu]   

                                    released  rating_cnt  \
0  {'$date': '2016-02-19T05:30:00.000+0530'}           0   
1  {'$date': '2016-06-17T05:30:00.000+0530'}           0   
2  {'$date': '2001-12-14T05:30:00.000+0530'}           0   

                 

In [16]:
import pandas as pd
import json

# Load cleaned movies
with open("indian movie data/cleaned_movies.json", "r", encoding="utf-8") as f:
    movies = [json.loads(line) for line in f]
movies_df = pd.DataFrame(movies)

# Load cleaned ratings
with open("indian movie data/cleaned_ratings.json", "r", encoding="utf-8") as f:
    ratings_data = json.load(f)
ratings_df = pd.DataFrame(ratings_data)

# Confirm
print("Movies:", movies_df.shape)
print("Ratings:", ratings_df.shape)


Movies: (2850, 14)
Ratings: (0, 0)


In [18]:
import pandas as pd
import json

# Load cleaned movies
with open("indian movie data/cleaned_movies.json", "r", encoding="utf-8") as f:
    movies_data = [json.loads(line) for line in f]
movies_df = pd.DataFrame(movies_data)

# Load cleaned ratings
with open("indian movie data/cleaned_ratings.json", "r", encoding="utf-8") as f:
    ratings_data = [json.loads(line) for line in f]
ratings_df = pd.DataFrame(ratings_data)

# Show shapes
print("✅ Movies shape:", movies_df.shape)
print("✅ Ratings shape:", ratings_df.shape)

# Show columns
print("\n🎬 Movie columns:", list(movies_df.columns))
print("⭐ Rating columns:", list(ratings_df.columns))


✅ Movies shape: (2850, 14)
✅ Ratings shape: (1, 0)

🎬 Movie columns: ['_id', 'rating', 'movie_id', 'description', 'language', 'released', 'rating_cnt', 'writer', 'director', 'cast', 'img_src', 'genre', 'inserted', 'name']
⭐ Rating columns: []


In [21]:
with open("indian movie data/ratings.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(f"Line {i+1}: {line.strip()}")
        if i == 4:
            break  # just print first 5 lines


Line 1: { "_id" : "11megha89", "rated" : { "tt0104561" : [ "1" ], "tt0323013" : [ "1" ], "tt2213054" : [ "0" ], "tt1447508" : [ "0" ], "tt4505006" : [ "0" ], "tt0248126" : [ "1" ], "tt0420332" : [ "1" ], "tt0284137" : [ "1" ], "tt0296574" : [ "-1" ], "tt0367110" : [ "1" ], "tt2960140" : [ "0" ], "tt2356180" : [ "1" ], "tt0306434" : [ "1" ], "tt0088986" : [ "0" ], "tt3863552" : [ "0" ], "tt0116763" : [ "0" ], "tt1324059" : [ "1" ], "submit" : [ "submit" ], "tt3848892" : [ "1" ], "tt0107166" : [ "1" ], "tt4900716" : [ "1" ], "tt5286444" : [ "1" ], "tt4088588" : [ "0" ], "tt0110076" : [ "1" ], "tt1188996" : [ "1" ], "tt0299108" : [ "1" ] } }
Line 2: { "_id" : "9953547227 ", "rated" : { "tt0060689" : [ "0" ], "tt0375066" : [ "0" ], "tt0887769" : [ "0" ], "tt0106333" : [ "1" ], "tt0056806" : [ "0" ], "tt0085776" : [ "0" ], "tt1904875" : [ "0" ], "tt0316407" : [ "0" ], "tt0328810" : [ "0" ], "tt0306434" : [ "0" ], "tt3261022" : [ "1" ], "tt0066758" : [ "0" ], "tt0283005" : [ "0" ], "tt007278

In [22]:
import json
import pandas as pd

# Load the raw nested ratings data
with open("indian movie data/ratings.json", "r", encoding="utf-8") as f:
    raw_lines = f.readlines()

cleaned_ratings = []

# Iterate through each user rating block
for line in raw_lines:
    try:
        entry = json.loads(line)
        user_id = entry.get("_id")
        rated = entry.get("rated", {})

        # Loop over each rated movie
        for movie_id, score_list in rated.items():
            if movie_id == "submit":  # skip "submit": ["submit"] entries
                continue
            try:
                score = int(score_list[0])
                if score in [0, 1, -1]:
                    cleaned_ratings.append({
                        "user_id": user_id,
                        "movie_id": movie_id,
                        "rating": score
                    })
            except:
                continue
    except json.JSONDecodeError:
        continue

# Convert to DataFrame
ratings_df = pd.DataFrame(cleaned_ratings)
print("✅ Cleaned Ratings shape:", ratings_df.shape)
print("🎯 Sample:\n", ratings_df.head())

# Save cleaned version
ratings_df.to_json("cleaned_ratings.json", orient="records", lines=True, force_ascii=False)


✅ Cleaned Ratings shape: (20652, 3)
🎯 Sample:
      user_id   movie_id  rating
0  11megha89  tt0104561       1
1  11megha89  tt0323013       1
2  11megha89  tt2213054       0
3  11megha89  tt1447508       0
4  11megha89  tt4505006       0


In [28]:
import pandas as pd

# Load cleaned ratings file
cleaned_ratings = pd.read_json("indian movie data/cleaned_ratings.json",lines=True)
print(cleaned_ratings.shape)
print(cleaned_ratings.head())


(20652, 3)
     user_id   movie_id  rating
0  11megha89  tt0104561       1
1  11megha89  tt0323013       1
2  11megha89  tt2213054       0
3  11megha89  tt1447508       0
4  11megha89  tt4505006       0
