In [52]:
import pandas as pd
import os

In [53]:
# Relative path to the iris CSV inside the Resources folder
file_path = os.path.join('Resources', 'iris.csv')

In [54]:
# Load the iris CSV at file_path into a DataFrame
iris_df = pd.read_csv(file_path)

In [55]:
# Preview the first rows to confirm the CSV loaded as expected
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [56]:
# Remove the "class" label column so only the measurement columns remain
new_iris_df = iris_df.drop(columns=["class"])

In [57]:
# Preview the frame to verify the "class" column is gone
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [58]:
# Put the two length measurements first, followed by the two width measurements
column_order = ['sepal_length', 'petal_length', 'sepal_width', 'petal_width']
new_iris_df = new_iris_df.loc[:, column_order]

In [59]:
# Preview the frame to verify the new column order
new_iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [60]:
# Save the reordered iris DataFrame to a new CSV file (index=False leaves the row index out of the file)
output_file_path = os.path.join('Resources', 'new_iris_data.csv')
new_iris_df.to_csv(output_file_path, index=False)

In [61]:
# Relative path to the shopping CSV that the following cells load and clean.
# NOTE(review): despite the name `cleaned_file_path`, this is the input file
# ('shopping_data.csv'), not a cleaned output - consider renaming.
cleaned_file_path = os.path.join('Resources', 'shopping_data.csv')

In [62]:
# Read the shopping CSV, decoding it as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8
df_shopping = pd.read_csv(cleaned_file_path, encoding= 'ISO-8859-1')

In [63]:
# Preview the first rows of the shopping data as loaded
df_shopping.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


### Data Selection

In [64]:
# List the column names available in the shopping data
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [65]:
# Report how many null values each column of the dataset contains
null_counts = df_shopping.isnull().sum()
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values.")

Column CustomerID has 0 null values.
Column Card Member has 2 null values.
Column Age has 2 null values.
Column Annual Income has 0 null values.
Column Spending Score (1-100) has 1 null values.


In [66]:
# Discard every row that has at least one missing value
df_shopping = df_shopping.dropna(axis=0, how="any")

In [67]:
# Count rows that are exact duplicates of an earlier row
duplicate_count = df_shopping.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [68]:
# Remove the CustomerID column, which this notebook treats as unnecessary
df_shopping = df_shopping.drop(columns=["CustomerID"])

In [69]:
# Preview the frame to verify CustomerID has been removed
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


### Data Processing

In [70]:
# Transform String Column
def change_string(member):
    """Encode a card-membership flag as an integer.

    Args:
        member: Raw cell value, expected to be the string "Yes" or "No".
            A value that is already encoded as 0 or 1 is also accepted.

    Returns:
        1 for "Yes" (or an existing 1); 0 for any other value.
    """
    # Pass already-encoded values through unchanged so that re-running the
    # cell that applies this function does not silently turn every 1 into 0.
    if not isinstance(member, str) and member in (0, 1):
        return int(member)
    return 1 if member == "Yes" else 0
# Encode the Card Member column by mapping change_string over every value
df_shopping['Card Member'] = df_shopping['Card Member'].map(change_string)

In [71]:
# Preview the frame to verify Card Member now holds integers instead of strings
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [73]:
# Rescale annual income to thousands (e.g. 15000 -> 15.0).
# Note: the column is overwritten, so running this cell twice divides twice.
df_shopping["Annual Income"] = df_shopping["Annual Income"].div(1000)

In [74]:
# Preview the frame to verify Annual Income is now expressed in thousands
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [84]:
# Rename columns to underscore-separated names without spaces or parentheses.
# An explicit old -> new mapping is used instead of assigning a positional list,
# so the result does not depend on the current column order and the cell can be
# re-run safely (labels that are no longer present are simply ignored).
df_shopping = df_shopping.rename(
    columns={
        "Card Member": "Card_Member",
        "Annual Income": "Annual_Income",
        "Spending Score (1-100)": "Spending_Score",
    }
)

In [85]:
# Preview the frame to verify the new column names
df_shopping.head()

Unnamed: 0,Card_Member,Age,Annual_Income,Spending_Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


### Data Transformation

In [87]:
# Output path for the cleaned shopping data, saved as CSV so others can load it.
# NOTE(review): this rebinds `file_path`, the same name the iris-loading cell
# reads from; re-running that cell after this one would load this file instead.
file_path = os.path.join('Resources', 'shopping_data_cleaned.csv')

In [88]:
# Write the cleaned shopping data to CSV. index=False keeps the row index
# (no longer contiguous after dropna) from being written as an extra unnamed
# column, matching how the iris CSV is saved earlier in this notebook.
df_shopping.to_csv(file_path, index=False)