In [2]:
import pandas as pd
import glob
import re

# Read all CSVs that start with imdb_ and end with .csv
files = glob.glob("imdb_*.csv")

# Merge into one DataFrame
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# Save combined file
df.to_csv("imdb_movies", index=False)

print(f"✅ Merged {len(files)} files, total {len(df)} movies saved to imdb_all_genres_2024.csv")


✅ Merged 6 files, total 7972 movies saved to imdb_all_genres_2024.csv


In [3]:
df_unique_names = df.drop_duplicates(subset=['title'])

In [4]:
print(df_unique_names)


                                                title duration  rating voters  \
0                               1. Mahavatar Narsimha   2h 10m     9.2    32K   
1                                            2. Relay   1h 52m     7.3   1.3K   
2                                     3. Freaky Tales   1h 47m     6.3    11K   
3                                   4. Dune: Part Two   2h 46m     8.5   660K   
4                                     5. Gladiator II   2h 28m     6.5   270K   
...                                               ...      ...     ...    ...   
7967                        2245. Mind and Distortion      57m     NaN    NaN   
7968  2246. Yean Yeathuku Yeppadi from Dishoreabinash      49m     NaN    NaN   
7969                                  2247. The Birth      NaN     NaN    NaN   
7970                                  2248. Seclusion      18m     NaN    NaN   
7971                            2249. Drive All Night      NaN     NaN    NaN   

         genre  
0       Ac

In [5]:
df.dtypes

title        object
duration     object
rating      float64
voters       object
genre        object
dtype: object

In [6]:
print("🔍 Duplicate Count:", df.duplicated().sum())
df = df.drop_duplicates()
print("✅ After removing duplicates:", df.shape)

🔍 Duplicate Count: 0
✅ After removing duplicates: (7972, 5)


In [7]:
if "title" in df.columns:
    # Extract number from title if present
    df["sl_no"] = df["title"].str.extract(r"^(\d+)")
    df["title"] = df["title"].str.replace(r"^\d+\.\s*", "", regex=True)

    # Convert to integer (nullable int first)
    df["sl_no"] = df["sl_no"].astype(pd.Int64Dtype())

    # Auto-generate if missing
df["sl_no"] = df["sl_no"].fillna(pd.Series(df.index + 1, index=df.index)).astype(int)

In [8]:
df

Unnamed: 0,title,duration,rating,voters,genre,sl_no
0,Mahavatar Narsimha,2h 10m,9.2,32K,Action,1
1,Relay,1h 52m,7.3,1.3K,Action,2
2,Freaky Tales,1h 47m,6.3,11K,Action,3
3,Dune: Part Two,2h 46m,8.5,660K,Action,4
4,Gladiator II,2h 28m,6.5,270K,Action,5
...,...,...,...,...,...,...
7967,Mind and Distortion,57m,,,Thriller,2245
7968,Yean Yeathuku Yeppadi from Dishoreabinash,49m,,,Thriller,2246
7969,The Birth,,,,Thriller,2247
7970,Seclusion,18m,,,Thriller,2248


In [9]:
# ---------------- STEP 5: Convert Duration → Minutes ----------------
def convert_duration(x):
    if pd.isna(x): 
        return None
    hours = re.search(r"(\d+)h", str(x))
    mins = re.search(r"(\d+)m", str(x))
    total = 0
    if hours:
        total += int(hours.group(1)) * 60
    if mins:
        total += int(mins.group(1))
    return total if total > 0 else None

if "duration" in df.columns:
    df["duration"] = df["duration"].apply(convert_duration)


In [10]:
df

Unnamed: 0,title,duration,rating,voters,genre,sl_no
0,Mahavatar Narsimha,130.0,9.2,32K,Action,1
1,Relay,112.0,7.3,1.3K,Action,2
2,Freaky Tales,107.0,6.3,11K,Action,3
3,Dune: Part Two,166.0,8.5,660K,Action,4
4,Gladiator II,148.0,6.5,270K,Action,5
...,...,...,...,...,...,...
7967,Mind and Distortion,57.0,,,Thriller,2245
7968,Yean Yeathuku Yeppadi from Dishoreabinash,49.0,,,Thriller,2246
7969,The Birth,,,,Thriller,2247
7970,Seclusion,18.0,,,Thriller,2248


In [11]:
# ---------------- STEP 6: Convert Voters → Numeric ----------------
def convert_voters(x):
    if pd.isna(x):
        return None
    # Clean input: lowercase, remove dots & extra spaces
    x = str(x).lower().replace(".", "").strip()
    
    if "k" in x:
        return float(x.replace("k", "")) * 1000
    elif "m" in x:
        return float(x.replace("m", "")) * 1000000
    else:
        try:
            return float(x)
        except:
            return None

if "voters" in df.columns:
    df["voters"] = df["voters"].apply(convert_voters)
    df["voters"] = df["voters"].fillna(df["voters"].median())

In [12]:
df

Unnamed: 0,title,duration,rating,voters,genre,sl_no
0,Mahavatar Narsimha,130.0,9.2,32000.0,Action,1
1,Relay,112.0,7.3,13000.0,Action,2
2,Freaky Tales,107.0,6.3,11000.0,Action,3
3,Dune: Part Two,166.0,8.5,660000.0,Action,4
4,Gladiator II,148.0,6.5,270000.0,Action,5
...,...,...,...,...,...,...
7967,Mind and Distortion,57.0,,138.0,Thriller,2245
7968,Yean Yeathuku Yeppadi from Dishoreabinash,49.0,,138.0,Thriller,2246
7969,The Birth,,,138.0,Thriller,2247
7970,Seclusion,18.0,,138.0,Thriller,2248


In [13]:
df.dtypes

title        object
duration    float64
rating      float64
voters      float64
genre        object
sl_no         int64
dtype: object

In [14]:
df.isna().sum()

title          0
duration    1530
rating      2271
voters         0
genre          0
sl_no          0
dtype: int64

In [15]:
# ---------------- STEP 3: Handle Nulls (Median) ----------------
df.fillna(
    {
        "rating": df["rating"].median(),
        "voters": df["voters"].median(),
        "duration": df["duration"].median()
    },
    inplace=True
)



In [16]:
df

Unnamed: 0,title,duration,rating,voters,genre,sl_no
0,Mahavatar Narsimha,130.0,9.2,32000.0,Action,1
1,Relay,112.0,7.3,13000.0,Action,2
2,Freaky Tales,107.0,6.3,11000.0,Action,3
3,Dune: Part Two,166.0,8.5,660000.0,Action,4
4,Gladiator II,148.0,6.5,270000.0,Action,5
...,...,...,...,...,...,...
7967,Mind and Distortion,57.0,5.9,138.0,Thriller,2245
7968,Yean Yeathuku Yeppadi from Dishoreabinash,49.0,5.9,138.0,Thriller,2246
7969,The Birth,95.0,5.9,138.0,Thriller,2247
7970,Seclusion,18.0,5.9,138.0,Thriller,2248


In [17]:
df.dtypes

title        object
duration    float64
rating      float64
voters      float64
genre        object
sl_no         int64
dtype: object

In [18]:
# ---------------- STEP 8: Add Auto ID + Reorder ----------------
df.reset_index(drop=True, inplace=True)
df.insert(0, "id", df.index + 1)   # add id column at front

desired_order = ["id", "sl_no", "title", "duration", "rating", "voters", "genre"]
df = df[[col for col in desired_order if col in df.columns]]

In [19]:
# Drop sl_no if you don't need it
if "sl_no" in df.columns:
    df.drop(columns=["sl_no"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["sl_no"], inplace=True)


In [20]:
df

Unnamed: 0,id,title,duration,rating,voters,genre
0,1,Mahavatar Narsimha,130.0,9.2,32000.0,Action
1,2,Relay,112.0,7.3,13000.0,Action
2,3,Freaky Tales,107.0,6.3,11000.0,Action
3,4,Dune: Part Two,166.0,8.5,660000.0,Action
4,5,Gladiator II,148.0,6.5,270000.0,Action
...,...,...,...,...,...,...
7967,7968,Mind and Distortion,57.0,5.9,138.0,Thriller
7968,7969,Yean Yeathuku Yeppadi from Dishoreabinash,49.0,5.9,138.0,Thriller
7969,7970,The Birth,95.0,5.9,138.0,Thriller
7970,7971,Seclusion,18.0,5.9,138.0,Thriller


In [21]:
pip install sqlalchemy
pip install tabulate
pip install pandas


SyntaxError: invalid syntax (1037902045.py, line 1)

In [23]:
import pymysql

# --- TiDB Connection Details ---
user = "2JRRhPHCS6mRsGW.root"
password = "mt2YyedoEbMz4suV"
host = "gateway01.ap-southeast-1.prod.aws.tidbcloud.com"
port = 4000

# Step 1: Connect without selecting a database
conn = pymysql.connect(
    host=host,
    user=user,
    password=password,
    port=port,
    ssl={"ssl_verify_cert": True, "ssl_verify_identity": True}
)

cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS imdb;")
print("✅ Database `imdb` created (or already exists).")

cur.close()
conn.close()


✅ Database `imdb` created (or already exists).


In [24]:
from sqlalchemy import create_engine
import pandas as pd

database = "imdb"

engine = create_engine(
    f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}",
    connect_args={
        "ssl": {
            "ssl_verify_cert": True,
            "ssl_verify_identity": True
        }
    }
)

# Save DataFrame to TiDB
df.to_sql(
    "imdb_movies",
    con=engine,
    if_exists="replace",
    index=False
)

print("🎉 DataFrame stored in TiDB -> imdb.imdb_movies")


🎉 DataFrame stored in TiDB -> imdb.imdb_movies


In [28]:
df.to_sql("imdb_movies", con=engine, index=False, if_exists="replace")
df.to_csv("imdb_movies", index=False)
