In [1]:
# !pip install pandas

Pandas Documentation: https://pandas.pydata.org/docs/

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

Dataframe: Pandas Object to represent tabular(structured) data

# Create DataFrame

### From Dictionary

In [3]:
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Aisha'],
    'Age': [25, 30, 28, np.nan, 28],
    'Department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'],
    'Salary': [50000, 65000, 60000, 72000, 80000]
}
df_hr = pd.DataFrame(data)
rows, cols = df_hr.shape
print(f"Rows: {rows}, Columns: {cols}")
df_hr

Rows: 5, Columns: 5


Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25.0,HR,50000
1,2,Bob,30.0,IT,65000
2,3,Charlie,28.0,Marketing,60000
3,4,Diana,,Finance,72000
4,5,Aisha,28.0,IT,80000


In [4]:
df_hr.columns # Shows all columns
# Accessing a Particular column:
# Use df_hr.Name or df_hr['Name']

# Accessing a group of selected columns:
# df_hr[['Name', 'Salary']]

Index(['ID', 'Name', 'Age', 'Department', 'Salary'], dtype='object')

In [5]:
# Important pandas datatypes:
type(df_hr['Name']), type(df_hr[['Name', 'Salary']])

(pandas.core.series.Series, pandas.core.frame.DataFrame)

In [6]:
df_hr.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
df_hr.set_index('Department', inplace=True)
df_hr

Unnamed: 0_level_0,ID,Name,Age,Salary
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HR,1,Alice,25.0,50000
IT,2,Bob,30.0,65000
Marketing,3,Charlie,28.0,60000
Finance,4,Diana,,72000
IT,5,Aisha,28.0,80000


In [8]:
df_hr.loc['IT']

Unnamed: 0_level_0,ID,Name,Age,Salary
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IT,2,Bob,30.0,65000
IT,5,Aisha,28.0,80000


In [9]:
df_hr.reset_index(inplace=True)

In [10]:
df_hr.set_index('ID', inplace=True)
df_hr

Unnamed: 0_level_0,Department,Name,Age,Salary
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,HR,Alice,25.0,50000
2,IT,Bob,30.0,65000
3,Marketing,Charlie,28.0,60000
4,Finance,Diana,,72000
5,IT,Aisha,28.0,80000


In [11]:
df_hr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 5
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Department  5 non-null      object 
 1   Name        5 non-null      object 
 2   Age         4 non-null      float64
 3   Salary      5 non-null      int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 200.0+ bytes


In [12]:
df_hr.isna().sum()

Department    0
Name          0
Age           1
Salary        0
dtype: int64

In [13]:
df_hr['Age'] = df_hr['Age'].fillna(np.floor(df_hr['Age'].mean())) # other ways to fill min, max, median etc
df_hr

Unnamed: 0_level_0,Department,Name,Age,Salary
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,HR,Alice,25.0,50000
2,IT,Bob,30.0,65000
3,Marketing,Charlie,28.0,60000
4,Finance,Diana,27.0,72000
5,IT,Aisha,28.0,80000


### From list of tuples

In [14]:
data = [
    (1, 'Alice', 25, 'HR', 50000),
    (2, 'Bob', 30, np.nan, 65000),
    (3, 'Aisha', 28, 'IT', 80000),
]
df = pd.DataFrame(data, columns=['ID', 'Name', 'Age', 'Department', 'Salary'])
rows, cols = df.shape
print(f"Rows: {rows}, Columns: {cols}")
df

Rows: 3, Columns: 5


Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25,HR,50000
1,2,Bob,30,,65000
2,3,Aisha,28,IT,80000


### From list of Dictionaries

In [15]:
data = [
    {'ID': 1, 'Name': 'Alice', 'Age': 25, 'Department':'HR', 'Salary': 50000},
    {'ID': 2, 'Name': 'Bob', 'Age': 30, 'Department': np.nan, 'Salary': 65000},
    {'ID': 3, 'Name': 'Aisha', 'Age': 28, 'Department': 'IT', 'Salary': 80000},
]
df = pd.DataFrame(data)
rows, cols = df.shape
print(f"Rows: {rows}, Columns: {cols}")
df

Rows: 3, Columns: 5


Unnamed: 0,ID,Name,Age,Department,Salary
0,1,Alice,25,HR,50000
1,2,Bob,30,,65000
2,3,Aisha,28,IT,80000


### Reading from xlsx

In [16]:
# !pip install openpyxl

In [17]:
# Link to the data used for tutorial from Kaggle - https://www.kaggle.com/datasets/himelsarder/retail-product-dataset-with-missing-values/data
df = pd.read_excel("datasets/synthetic_dataset.xlsx", sheet_name="synthetic_dataset")
print(df.shape)
df.sample(2) # Random 2 rows

(4362, 5)


Unnamed: 0,Category,Price,Rating,Stock,Discount
1345,D,934.0,4.186911,,9.0
2798,,2770.0,,,


In [18]:
df = pd.read_excel("datasets/synthetic_dataset_without_column_names.xlsx", header=None, names=['Category', 'Price', 'Rating', 'Stock', 'Discount'], na_values={'Discount':[-1], 'Price': [-1], 'Category': 'unknown'})

In [19]:
df.head(2)

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548,1.870322,,
1,,3045,4.757798,,38.0


In [20]:
# Alternatively use converters
def fix_unknown_or_negative(cell):
    if cell == 'unknown' or cell == -1:
        return np.nan
df_excel = pd.read_excel("datasets/synthetic_dataset_without_column_names.xlsx", header=None, names=['Category', 'Price', 'Rating', 'Stock', 'Discount'], converters={'Discount':fix_unknown_or_negative, 'Price': fix_unknown_or_negative, 'Category': fix_unknown_or_negative})
df_excel.head(2)

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,,1.870322,,
1,,,4.757798,,


### Reading from Parquet

In [21]:
# !pip install pyarrow

In [22]:
# !pip install fastparquet

In [23]:
df = pd.read_parquet("datasets/synthetic_dataset.parquet")
print(df.shape)
df.tail(2) # last 2 rows

(4362, 5)


Unnamed: 0,Category,Price,Rating,Stock,Discount
4360,D,2999.0,4.425995,,40.0
4361,,,2.184471,,0.0


### Reading from CSV

In [24]:
df = pd.read_csv("datasets/Synthetic_dataset_multiple_headers.csv", header=2, skiprows=0, nrows=2, usecols=["Price", "Discount"])
# if we have two header and we want to consider the 2nd row as column names set header=1 (here, consider 2nd row and not 0th and 1st row for column names)
# or skiprows=2 (skip top 2 rows, and start from the next one). We can skip any no. of rows
# read only 2 rows
# read only Price and Discount columns
df

Unnamed: 0,Price,Discount
0,5548,0
1,3045,38


In [25]:
df = pd.read_csv("datasets/synthetic_dataset_without_column_names.csv", header=None, names=['Category', 'Price', 'Rating', 'Stock', 'Discount'], na_values={'Discount':[-1], 'Price': [-1]})
# if we our data has specific encoding ex: encoding='latin1', 'utf-8' add it
# if we have na_values in a particular column written differently like na_values={'Discount':[-1]} i.e. considering '-1' as NA value for Discount. 
# we cannot have Discount < 0
print(f"Shape: {df.shape}")
# Last 5 rows: df.tail()
# Random 5 rows: df.sample(5)
# Selected indexes: df[5:10]
# First 5 rows
df.head()

Shape: (7, 5)


Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548,1.870322,,
1,,3045,4.757798,,38.0
2,,4004,,In Stock,
3,,4808,1.492085,,33.0
4,,1817,,Out of Stock,23.0


### Understand the data

The dataset includes:

Category (Categorical): Product category (A, B, C, D)

Price (Numerical): Randomized product prices

Rating (Numerical): Ratings between 1 to 5

Stock (Categorical): Availability status (In Stock, Out of Stock)

Discount (Numerical): Discount percentage

In [26]:
df = pd.read_csv("datasets/synthetic_dataset.csv")
print(df.shape)
df.head()

(4362, 5)


Unnamed: 0,Category,Price,Rating,Stock,Discount
0,,5548.0,1.870322,,0.0
1,,3045.0,4.757798,,38.0
2,,4004.0,,In Stock,0.0
3,,4808.0,1.492085,,33.0
4,,1817.0,,Out of Stock,23.0


# Indexing in Pandas

In [27]:
# Column selection (dict-style) - Returns a Series.
df['Category'].head(2) # or df.Category

0    NaN
1    NaN
Name: Category, dtype: object

In [28]:
# Label-based access - Works for both row and column selection.
print(df.loc[2]) # all values in 2nd row
print("****************************************")
print(df.loc[2, 'Price']) # 2nd Row Price column
print("****************************************")
print(df.loc[1:3, ['Price', 'Rating']]) # Row 1-3, Price and Rating column
print("****************************************")
print(df.loc[1:3, :]) # Row 1-3, all column
print("****************************************")

Category         NaN
Price         4004.0
Rating           NaN
Stock       In Stock
Discount         0.0
Name: 2, dtype: object
****************************************
4004.0
****************************************
    Price    Rating
1  3045.0  4.757798
2  4004.0       NaN
3  4808.0  1.492085
****************************************
  Category   Price    Rating     Stock  Discount
1      NaN  3045.0  4.757798       NaN      38.0
2      NaN  4004.0       NaN  In Stock       0.0
3      NaN  4808.0  1.492085       NaN      33.0
****************************************


In [29]:
# Integer position-based access - iloc doesnot takes the last item in the range
print(df.iloc[2]) # all values in 2nd row
print("****************************************")
print(df.iloc[2, 1]) # 2nd Row Price column
print("****************************************")
print(df.iloc[1:3, 1:3]) # Row 1-2, Price and Rating colum
print("****************************************")
print(df.iloc[1:3, :]) # Row 1-2, all columns

Category         NaN
Price         4004.0
Rating           NaN
Stock       In Stock
Discount         0.0
Name: 2, dtype: object
****************************************
4004.0
****************************************
    Price    Rating
1  3045.0  4.757798
2  4004.0       NaN
****************************************
  Category   Price    Rating     Stock  Discount
1      NaN  3045.0  4.757798       NaN      38.0
2      NaN  4004.0       NaN  In Stock       0.0


In [30]:
# Boolean indexing
print(df[df['Discount']>48][:2]) # first 2 rows with discount > 48
print("****************************************")
print(df.loc[(df['Discount']>48) & (df['Rating']>4)][:2]) # first 2 rows with discount > 48 and rating > 4

   Category   Price    Rating     Stock  Discount
74      NaN   859.0  1.227946       NaN      49.0
84      NaN  6884.0       NaN  In Stock      49.0
****************************************
    Category   Price    Rating         Stock  Discount
115        C  3182.0  4.874852      In Stock      49.0
488      NaN  2527.0  4.841732  Out of Stock      49.0


In [31]:
# Fast scalar access -  output is single value
print(df.at[2, 'Price'])
# print(df.at[2, ['Price', 'Rating']]) # Error: because it is only for scalar o/p, so it can take a list

4004.0


# Data wrangling/ Data mungling -  Process of cleaning messy data

### Duplicates

In [32]:
print(df.loc[df.duplicated(keep=False)]) # View all duplicates (including the first occurrence)
# subset=['Rating', 'Price'] subset is used to view duplicates on specific columns

     Category  Price  Rating         Stock  Discount
32        NaN    NaN     NaN           NaN       NaN
102       NaN    NaN     NaN      In Stock       1.0
185       NaN    NaN     NaN           NaN       0.0
505       NaN    NaN     NaN           NaN       NaN
587       NaN    NaN     NaN      In Stock      44.0
1210      NaN    NaN     NaN  Out of Stock      16.0
1596      NaN    NaN     NaN      In Stock      46.0
1746      NaN    NaN     NaN      In Stock      44.0
1760      NaN    NaN     NaN      In Stock      44.0
1981      NaN    NaN     NaN  Out of Stock      28.0
1995      NaN    NaN     NaN  Out of Stock       NaN
2051      NaN    NaN     NaN           NaN       NaN
2124      NaN    NaN     NaN  Out of Stock       NaN
2241      NaN    NaN     NaN  Out of Stock       NaN
2784      NaN    NaN     NaN           NaN      42.0
3302      NaN    NaN     NaN      In Stock       1.0
3314      NaN    NaN     NaN  Out of Stock      16.0
3321      NaN    NaN     NaN  Out of Stock    

In [33]:
new_df = df.drop_duplicates(keep='last') #  default keep ='first', subset=['column1', 'column2'] drop specific to subset column
print(new_df.shape)
print(new_df[new_df.duplicated()])

(4347, 5)
Empty DataFrame
Columns: [Category, Price, Rating, Stock, Discount]
Index: []


In [34]:
df.isna().sum()

Category    2748
Price        174
Rating      2050
Stock       1352
Discount     392
dtype: int64

##### Drop NA

In [35]:
print(f"Original: {df.shape}")
df_cleaned = df.dropna(how='all') # when all values in a row are missing
print(f"All empty row: {df_cleaned.shape}")
# or
df_cleaned = df.dropna(thresh=1) # Keep the row with atleast 1 non na value
print(f"Keep the row with atleast 1 non na value: {df_cleaned.shape}")

df_cleaned = df.dropna(thresh=2) # Keep the row with atleast 2 non na value
print(f"Keep the row with atleast 2 non na value: {df_cleaned.shape}")

df_cleaned = df.dropna(how='any') # Drop if any NaN (default)
print(f"Any empty row: {df_cleaned.shape}")

df_cleaned = df.dropna(subset=['Price', 'Discount']) # Only consider these columns for NaN checking
print(f"Price/Discount missing: {df_cleaned.shape}")

Original: (4362, 5)
All empty row: (4359, 5)
Keep the row with atleast 1 non na value: (4359, 5)
Keep the row with atleast 2 non na value: (4306, 5)
Any empty row: (540, 5)
Price/Discount missing: (3809, 5)


##### Fill NA

In [36]:
df = df.sort_values(by=["Category"])
df = df.reset_index(drop=True)
df

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,,Out of Stock,9.0
1,A,3424.0,4.690405,Out of Stock,
2,A,4865.0,,,41.0
3,A,1814.0,,In Stock,32.0
4,A,3592.0,3.796978,Out of Stock,35.0
...,...,...,...,...,...
4357,,,,In Stock,46.0
4358,,,,Out of Stock,21.0
4359,,4436.0,4.728335,,49.0
4360,,3283.0,,Out of Stock,9.0


In [37]:
df_cleaned = df.fillna(0)
df_cleaned.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,0.0,Out of Stock,9.0
1,A,3424.0,4.690405,Out of Stock,0.0
2,A,4865.0,0.0,0,41.0
3,A,1814.0,0.0,In Stock,32.0
4,A,3592.0,3.796978,Out of Stock,35.0


In [38]:
#ffill
df_cleaned = df.fillna(method='ffill', limit=1) # limit=1 only ffill to the next column. limit=2, ffill to next two columns
df_cleaned.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,,Out of Stock,9.0
1,A,3424.0,4.690405,Out of Stock,9.0
2,A,4865.0,4.690405,Out of Stock,41.0
3,A,1814.0,,In Stock,32.0
4,A,3592.0,3.796978,Out of Stock,35.0


In [39]:
df_cleaned = df.copy()
df_cleaned['Discount'] = df_cleaned.groupby(['Category'])['Discount'].fillna(method='ffill')
df_cleaned.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,,Out of Stock,9.0
1,A,3424.0,4.690405,Out of Stock,9.0
2,A,4865.0,,,41.0
3,A,1814.0,,In Stock,32.0
4,A,3592.0,3.796978,Out of Stock,35.0


In [40]:
#bfill
df_cleaned = df.copy()
df_cleaned['Rating'] = df_cleaned.groupby(['Category'])['Rating'].fillna(method='bfill')
df_cleaned.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,4.690405,Out of Stock,9.0
1,A,3424.0,4.690405,Out of Stock,
2,A,4865.0,3.796978,,41.0
3,A,1814.0,3.796978,In Stock,32.0
4,A,3592.0,3.796978,Out of Stock,35.0


In [41]:
df_cleaned = df.fillna({'Price': df['Price'].median(),
                        'Discount': df['Discount'].median(),
                        'Rating': df['Rating'].median(),
                        'Stock': 'No information'})
df_cleaned.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,3.08206,Out of Stock,9.0
1,A,3424.0,4.690405,Out of Stock,25.0
2,A,4865.0,3.08206,No information,41.0
3,A,1814.0,3.08206,In Stock,32.0
4,A,3592.0,3.796978,Out of Stock,35.0


In [42]:
df_cleaned = df.copy()
df_cleaned[['Stock', 'Category']]=df_cleaned[['Stock', 'Category']].fillna('No information')
df_cleaned = df_cleaned.groupby('Category').apply(
    lambda group: group.assign(
        Price=group['Price'].fillna(group['Price'].median()),
        Discount=group['Discount'].fillna(group['Discount'].median()),
        Rating=group['Rating'].fillna(group['Rating'].median()),
    )
).reset_index(drop=True)
df_cleaned

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,3.029522,Out of Stock,9.0
1,A,3424.0,4.690405,Out of Stock,26.0
2,A,4865.0,3.029522,No information,41.0
3,A,1814.0,3.029522,In Stock,32.0
4,A,3592.0,3.796978,Out of Stock,35.0
...,...,...,...,...,...
4357,No information,4962.0,3.100418,In Stock,46.0
4358,No information,4962.0,3.100418,Out of Stock,21.0
4359,No information,4436.0,4.728335,No information,49.0
4360,No information,3283.0,3.100418,Out of Stock,9.0


In [43]:
# or
df_cleaned = df.copy()
df_cleaned[['Stock', 'Category']]=df_cleaned[['Stock', 'Category']].fillna('No information')
cols_to_fill = ['Price', 'Discount', 'Rating']
for cols in cols_to_fill:
    df_cleaned[cols] = df_cleaned[cols].fillna(df_cleaned.groupby('Category')[cols].transform('median'))
df_cleaned

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,3.029522,Out of Stock,9.0
1,A,3424.0,4.690405,Out of Stock,26.0
2,A,4865.0,3.029522,No information,41.0
3,A,1814.0,3.029522,In Stock,32.0
4,A,3592.0,3.796978,Out of Stock,35.0
...,...,...,...,...,...
4357,No information,4962.0,3.100418,In Stock,46.0
4358,No information,4962.0,3.100418,Out of Stock,21.0
4359,No information,4436.0,4.728335,No information,49.0
4360,No information,3283.0,3.100418,Out of Stock,9.0


In [44]:
# interpolate
df_cleaned = df.copy()
df_cleaned['Rating'] = df_cleaned.groupby('Category')['Rating'].apply(lambda x: x.interpolate()).reset_index(level=0, drop=True) # default - linear
df_cleaned.head() # Gradually decreased the rating from 4.69 to 3.79 (linear transformation)

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,,Out of Stock,9.0
1,A,3424.0,4.690405,Out of Stock,
2,A,4865.0,4.392596,,41.0
3,A,1814.0,4.094787,In Stock,32.0
4,A,3592.0,3.796978,Out of Stock,35.0


In [45]:
# replace - Usage1. Mapping Categorical cols
df_replaced = df.replace(['In Stock', 'Out of Stock', np.nan], [1, 2, 0])
df_replaced.head()

Unnamed: 0,Category,Price,Rating,Stock,Discount
0,A,6649.0,0.0,2.0,9.0
1,A,3424.0,4.690405,2.0,0.0
2,A,4865.0,0.0,0.0,41.0
3,A,1814.0,0.0,1.0,32.0
4,A,3592.0,3.796978,2.0,35.0


In [46]:
# replace - Usage2. Specific Outliers
data = {
    'id': [1, 2, 3, 4, 5],
    'age': [25, '27 years', '26 yr', 102, 24],
    'income': [50000, 52000, 51000, -1111111, -999999],
    'department': ['HR', 'Finance', 'IT', 'No information', 'Executive']
}
df_replaced = pd.DataFrame(data)
df_replaced

Unnamed: 0,id,age,income,department
0,1,25,50000,HR
1,2,27 years,52000,Finance
2,3,26 yr,51000,IT
3,4,102,-1111111,No information
4,5,24,-999999,Executive


In [47]:
df_replaced1 = df_replaced.replace({'income': [-999999, -1111111], 'department':'No information'}, np.nan)
df_replaced1

Unnamed: 0,id,age,income,department
0,1,25,50000.0,HR
1,2,27 years,52000.0,Finance
2,3,26 yr,51000.0,IT
3,4,102,,
4,5,24,,Executive


In [48]:
df_replaced = df_replaced.replace({-999999: np.nan,
                                   -1111111: np.nan,
                                   "No information": np.nan,
                                  })
df_replaced

Unnamed: 0,id,age,income,department
0,1,25,50000.0,HR
1,2,27 years,52000.0,Finance
2,3,26 yr,51000.0,IT
3,4,102,,
4,5,24,,Executive


In [49]:
df_replaced = df_replaced.replace({'age': '[a-zA-Z]'}, '', regex=True)
df_replaced

Unnamed: 0,id,age,income,department
0,1,25,50000.0,HR
1,2,27,52000.0,Finance
2,3,26,51000.0,IT
3,4,102,,
4,5,24,,Executive


# Basic Stats from Data

In [50]:
print(f"Maximum Price: {df['Price'].max()}") 
print(f"Average Rating: {df['Rating'].mean()}") 
print(f"Minimum Discount: {df['Discount'].min()}") 
print(f"Price deviation: {df['Price'].std()}")

Maximum Price: 9999.0
Average Rating: 3.0382925191086185
Minimum Discount: 0.0
Price deviation: 2839.984813424091


In [51]:
df.describe() # for mathematical columns

Unnamed: 0,Price,Rating,Discount
count,4188.0,2312.0,3970.0
mean,5016.97063,3.038293,24.516625
std,2839.984813,1.143074,14.347164
min,102.0,1.000366,0.0
25%,2628.25,2.06949,12.0
50%,4996.5,3.08206,25.0
75%,7418.0,4.00862,37.0
max,9999.0,4.997818,49.0


# Saving final frame

In [52]:
# To parquet
# df.to_parquet("datasets/final_synthetic_dataset.parquet")
# To excel
df.to_excel("datasets/final_synthetic_dataset.xlsx", sheet_name='retail_prices', index=False, startrow=3, startcol=5)
# Write different datasets to multiple sheets
# with pd.ExcelWriter("datasets/final_synthetic_dataset.xlsx") as excel_writer:
#     df_hr.to_excel(excel_writer, sheet_name="hr")
#     df_excel.to_excel(excel_writer, sheet_name="retail_prices")
# To CSV
# df.to_csv("datasets/final_synthetic_dataset.csv", index=False, columns = ['Price', 'Rating']) # saving selected columns only
# df.to_csv("datasets/final_synthetic_dataset.csv", index=False, header=False) # save without column names