In [1]:
# import dependecies 
import pandas as pd

In [2]:
# upload file
file_path = "iris.csv"
iris_df = pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# drop class column becuase unspervised learning only deals with numerical data 
new_iris_df = iris_df.drop(['class'], axis=1)
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
# reorder the columns using reindex method
new_order = ['sepal_length', 'petal_length', 'sepal_width', 'petal_width']
new_iris_df = new_iris_df.reindex(columns=new_order)

# print the resulting DataFrame
new_iris_df.head()


Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [9]:
# preprocessed DataFrame is saved on a new CSV file for future use
output_file_path = "new_iris_data.csv"
new_iris_df.to_csv(output_file_path, index=False)

In [None]:
# Questions to consider
## What knowledge do we hope to glean from running an unsupervised learning model on this dataset?
## What data is available? What type? What is missing? What can be removed?
## Is the data in a format that can be passed into an unsupervised learning model?
## Can I quickly hand off this data for others to use?

In [10]:
# Load Data for shopping_data.csv
file_path = "shopping_data.csv"
df_shopping = pd.read_csv(file_path, encoding="ISO-8859-1")
df_shopping.head(5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [11]:
# Columns
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [12]:
# List dataframe data types
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [13]:
# check for missing values
missing_values = df_shopping.isna().sum()

# print the number of missing values in each column
print(missing_values)

CustomerID                0
Card Member               2
Age                       2
Annual Income             0
Spending Score (1-100)    1
dtype: int64


In [14]:
# another apprach to find the missing values
for column in df_shopping.columns:
    print(f"Column {column} has {df_shopping[column].isnull().sum()} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [15]:
# Drop null rows
df_shopping = df_shopping.dropna()

In [16]:
# check for missing values
missing_values = df_shopping.isna().sum()

# print the number of missing values in each column
print(missing_values)

CustomerID                0
Card Member               0
Age                       0
Annual Income             0
Spending Score (1-100)    0
dtype: int64


In [17]:
# Find duplicate entries
print(f"Duplicate entires: {df_shopping.duplicated().sum()}")

Duplicate entires: 0


In [22]:
# Removed the CustomerID Column which dosen't offer any insight into customer shopping habits 
df_shopping.drop(columns=["CustomerID"], inplace=True)
df_shopping.head()

KeyError: "['CustomerID'] not found in axis"

In [23]:
# double checking df_shopping
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [24]:
# Transform String column to numerica with Yes as 1 and No as 0 
def change_string(member):
    if member == "Yes":
        return 1
    else:
        return 0

df_shopping["Card Member"] = df_shopping["Card Member"].apply(change_string)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [25]:
# Scale Annual Income column by dividing by 1,000 
df_shopping["Annual Income"] = df_shopping["Annual Income"]/1000
df_shopping.head()


Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [26]:
# Reformat the names of the columns so they contain no spaces or numbers.
# rename the columns to remove spaces and numbers
new_names = {
    'Card Member': 'CardMember',
    'Age': 'Age',
    'Annual Income': 'AnnualIncome',
    'Spending Score (1-100)': 'SpendingScore'
}
df_shopping = df_shopping.rename(columns=new_names)

# print the resulting DataFrame
print(df_shopping.head())

   CardMember   Age  AnnualIncome  SpendingScore
0           1  19.0          15.0           39.0
1           1  21.0          15.0           81.0
2           0  20.0          16.0            6.0
3           0  23.0          16.0           77.0
4           0  31.0          17.0           40.0
