In [49]:
import pandas as pd

df = pd.read_csv('flipkart_laptop.csv')

df.head()


Unnamed: 0,Product_name,Prices,Description,Reviews
0,DELL 15 AMD Ryzen 5 Hexa Core 7530U - (16 GB/5...,"₹41,990",AMD Ryzen 5 Hexa Core Processor16 GB DDR4 RAMW...,4.2
1,HP 15 (2024) AMD Ryzen 3 Quad Core 7320U - (8 ...,"₹29,990",AMD Ryzen 3 Quad Core Processor8 GB DDR4 RAMWi...,4.2
2,ASUS Vivobook 15 Intel Core i3 13th Gen 1315U ...,"₹35,990",Intel Core i3 Processor (13th Gen)8 GB DDR4 RA...,4.5
3,SAMSUNG Galaxy Book4 Metal Intel Core i5 13th ...,"₹51,990",Intel Core i5 Processor (13th Gen)16 GB LPDDR4...,4.2
4,ASUS Vivobook 15 Intel Core i5 12th Gen 1235U ...,"₹40,990",Intel Core i5 Processor (12th Gen)8 GB DDR4 RA...,4.2


In [50]:
#missing value
df.isnull().sum()

Product_name      0
Prices            0
Description       0
Reviews         123
dtype: int64

In [51]:
#handling missing values
df['Reviews'] = df['Reviews'].fillna('N/A')  

In [52]:
df.isnull().sum()

Product_name    0
Prices          0
Description     0
Reviews         0
dtype: int64

In [53]:
df.shape

(984, 4)

In [54]:
# Step 4: Remove duplicate rows
df = df.drop_duplicates()

In [55]:
df.shape

(705, 4)

In [56]:
# Step 5: Check and convert data types 

df['Prices'] = df['Prices'].replace({'₹': '', ',': ''}, regex=True).astype(float)

In [57]:
df.head()

Unnamed: 0,Product_name,Prices,Description,Reviews
0,DELL 15 AMD Ryzen 5 Hexa Core 7530U - (16 GB/5...,41990.0,AMD Ryzen 5 Hexa Core Processor16 GB DDR4 RAMW...,4.2
1,HP 15 (2024) AMD Ryzen 3 Quad Core 7320U - (8 ...,29990.0,AMD Ryzen 3 Quad Core Processor8 GB DDR4 RAMWi...,4.2
2,ASUS Vivobook 15 Intel Core i3 13th Gen 1315U ...,35990.0,Intel Core i3 Processor (13th Gen)8 GB DDR4 RA...,4.5
3,SAMSUNG Galaxy Book4 Metal Intel Core i5 13th ...,51990.0,Intel Core i5 Processor (13th Gen)16 GB LPDDR4...,4.2
4,ASUS Vivobook 15 Intel Core i5 12th Gen 1235U ...,40990.0,Intel Core i5 Processor (12th Gen)8 GB DDR4 RA...,4.2


In [58]:
# Step 6: Clean any inconsistent formatting (e.g., remove extra spaces)
df['Product_name'] = df['Product_name'].str.strip()

In [59]:
# Step 7: Preview the cleaned data
print("\nCleaned Data:")
df.head()



Cleaned Data:


Unnamed: 0,Product_name,Prices,Description,Reviews
0,DELL 15 AMD Ryzen 5 Hexa Core 7530U - (16 GB/5...,41990.0,AMD Ryzen 5 Hexa Core Processor16 GB DDR4 RAMW...,4.2
1,HP 15 (2024) AMD Ryzen 3 Quad Core 7320U - (8 ...,29990.0,AMD Ryzen 3 Quad Core Processor8 GB DDR4 RAMWi...,4.2
2,ASUS Vivobook 15 Intel Core i3 13th Gen 1315U ...,35990.0,Intel Core i3 Processor (13th Gen)8 GB DDR4 RA...,4.5
3,SAMSUNG Galaxy Book4 Metal Intel Core i5 13th ...,51990.0,Intel Core i5 Processor (13th Gen)16 GB LPDDR4...,4.2
4,ASUS Vivobook 15 Intel Core i5 12th Gen 1235U ...,40990.0,Intel Core i5 Processor (12th Gen)8 GB DDR4 RA...,4.2


In [60]:
# Save the cleaned data to a new CSV file
df.to_csv('flipkart_laptop_cleaned.csv', index=False)

In [61]:
df['Product_name'][0]

'DELL 15 AMD Ryzen 5 Hexa Core 7530U - (16 GB/512 GB SSD/Windows 11 Home) 3535 Thin and Light Laptop'

In [62]:
df['Description'][0]

'AMD Ryzen 5 Hexa Core Processor16 GB DDR4 RAMWindows 11 Operating System512 GB SSD39.62 cm (15.6 Inch) DisplayMy Dell (Dell Power Manager,Dell Support Assist)1 Year Onsite Warranty'

In [63]:
import pandas as pd
import re

# ✅ Step 1: Load dataset
df = pd.read_csv('flipkart_laptop_cleaned.csv')

# ✅ Step 2: Clean numeric columns (don't lower-case Description!)
df['Prices'] = pd.to_numeric(df['Prices'], errors='coerce')
df['Description'] = df['Description'].astype(str)  # Keep original casing

# ✅ Step 3: Extract structured specs from the Description
def extract_and_clean_specs(description):
    specs = {}

    # CPU (case-insensitive)
    cpu_match = re.search(r'^(.*?Processor)', description, re.IGNORECASE)
    specs['CPU'] = cpu_match.group(1).replace('Processor', '').strip() if cpu_match else "Unknown"

    # RAM
    ram_match = re.search(r'(\d+\s?GB).*?RAM', description, re.IGNORECASE)
    specs['RAM'] = ram_match.group(1).strip() if ram_match else "Unknown"

    # OS
    os_match = re.search(r'(Windows \d+)', description, re.IGNORECASE)
    specs['OS'] = os_match.group(1).strip() if os_match else "Not Specified"

    # Storage
    storage_match = re.search(r'(\d+\s?GB.*?)SSD', description, re.IGNORECASE)
    specs['Storage'] = storage_match.group(1).strip() + " SSD" if storage_match else "Not Specified"

    # Display
    display_match = re.search(r'(\d+(\.\d+)?\s?cm).*?Display', description, re.IGNORECASE)
    specs['Display'] = display_match.group(1).strip() if display_match else "Unknown"

    # Warranty
    warranty_match = re.search(r'(\d+\s?Year.*?)Warranty', description, re.IGNORECASE)
    specs['Warranty'] = warranty_match.group(1).replace('Onsite', '').strip() if warranty_match else "No Warranty"

    return pd.Series(specs)

# ✅ Step 4: Apply spec extraction to all rows
df_specs = df['Description'].apply(extract_and_clean_specs)

# ✅ Step 5: Combine with original DataFrame
df_cleaned = pd.concat([df, df_specs], axis=1)

# ✅ Step 6: Fill any missing values
df_cleaned.fillna("Unknown", inplace=True)

df_cleaned.head()

  df_cleaned.fillna("Unknown", inplace=True)


Unnamed: 0,Product_name,Prices,Description,Reviews,CPU,RAM,OS,Storage,Display,Warranty
0,DELL 15 AMD Ryzen 5 Hexa Core 7530U - (16 GB/5...,41990.0,AMD Ryzen 5 Hexa Core Processor16 GB DDR4 RAMW...,4.2,AMD Ryzen 5 Hexa Core,16 GB,Windows 11,16 GB DDR4 RAMWindows 11 Operating System512 G...,39.62 cm,1 Year
1,HP 15 (2024) AMD Ryzen 3 Quad Core 7320U - (8 ...,29990.0,AMD Ryzen 3 Quad Core Processor8 GB DDR4 RAMWi...,4.2,AMD Ryzen 3 Quad Core,8 GB,Windows 11,8 GB DDR4 RAMWindows 11 Operating System512 GB...,39.62 cm,1 Year
2,ASUS Vivobook 15 Intel Core i3 13th Gen 1315U ...,35990.0,Intel Core i3 Processor (13th Gen)8 GB DDR4 RA...,4.5,Intel Core i3,8 GB,Windows 11,8 GB DDR4 RAMWindows 11 Operating System512 GB...,39.62 cm,1 Year
3,SAMSUNG Galaxy Book4 Metal Intel Core i5 13th ...,51990.0,Intel Core i5 Processor (13th Gen)16 GB LPDDR4...,4.2,Intel Core i5,16 GB,Windows 11,16 GB LPDDR4X RAMWindows 11 Operating System51...,39.62 cm,1 Year Manufacturer
4,ASUS Vivobook 15 Intel Core i5 12th Gen 1235U ...,40990.0,Intel Core i5 Processor (12th Gen)8 GB DDR4 RA...,4.2,Intel Core i5,8 GB,Windows 11,8 GB DDR4 RAMWindows 11 Operating System512 GB...,39.62 cm,1 Year


In [64]:
# ✅ Step 7: Save cleaned and structured CSV
df_cleaned.to_csv('flipkart_laptop_cleaned_structured.csv', index=False)
print("✅ Cleaned and saved as 'flipkart_laptop_cleaned_structured.csv'")

✅ Cleaned and saved as 'flipkart_laptop_cleaned_structured.csv'
