In [1]:
import pandas as pd

In [2]:
# Load the iris dataset (relative path, resolved against the current working
# directory) and preview the first five rows.
file_path = "iris.csv"
iris_df = pd.read_csv(filepath_or_buffer=file_path)
iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Build a copy of the iris frame without the 'class' label column.
new_iris_df = iris_df.drop(columns=['class'])
new_iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [7]:
# Reorder the columns so both length measurements come before both widths.
cols = ['sepal_length', 'petal_length', 'sepal_width', 'petal_width']
new_iris_df = new_iris_df.loc[:, cols]

In [8]:
# Preview the first five rows of the reordered iris frame.
new_iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [9]:
# Write the iris feature frame to CSV; index=False keeps the row index out of the file.
output_file_path = "new_iris_data.csv"
new_iris_df.to_csv(path_or_buf=output_file_path, index=False)

## ----------


In [29]:
# Load the shopping dataset, decoding the file as ISO-8859-1, and preview
# the first five rows.
file_path = "shopping_data.csv"
df_shopping = pd.read_csv(filepath_or_buffer=file_path, encoding='ISO-8859-1')
df_shopping.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [30]:
# List the column labels of the shopping frame.
df_shopping.columns


Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [31]:
# Show the dtype pandas inferred for each column.
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

### Unsupervised learning models can't handle missing data. If you try to run a model on a dataset with missing data, you'll get an error such as the one below:

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Pandas has the `isnull()` method to check for missing values. We'll loop through each column, count its null values, and print a readable total:

In [32]:
# Count missing values once for the whole frame, then report them per column.
null_counts = df_shopping.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


### Rows of data with null values can be removed with the dropna() method, as shown below:

In [33]:
# Discard every row that has a missing value in at least one column.
df_shopping = df_shopping.dropna(axis=0, how='any')

#### Duplicates can also be removed.
Use the duplicated().sum() method to check for duplicates, as shown below:

In [34]:
# Count rows that are exact repeats of an earlier row and report the total.
duplicate_count = df_shopping.duplicated().sum()
print(f"{duplicate_count} duplicates exist")

0 duplicates exist


In [35]:
# Preview the first five rows of the shopping frame.
df_shopping.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [36]:
# Remove the CustomerID column.
# Reassigning the result (instead of inplace=True) and ignoring an
# already-missing label keeps this cell safe to re-run: the in-place form
# raises KeyError on a second execution because the column is already gone.
df_shopping = df_shopping.drop(columns=['CustomerID'], errors='ignore')

In [37]:
# Preview the first five rows to confirm which columns remain.
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


To make sure we can use our string data, we'll transform our strings of Yes and No from the Card Member column to 1 and 0, respectively, by creating a function that will convert Yes to a 1 and anything else to 0.

The function will then be run on the whole column with the .apply method, as shown below:

In [38]:
# Helper used to turn the "Card Member" strings into numbers.
def change_string(member):
    """Encode a membership flag as an integer.

    Returns 1 when ``member`` equals the exact string "Yes" and 0 for any
    other value (including "No", differently-cased text, or non-strings).
    """
    return 1 if member == "Yes" else 0

# Encode 'Card Member' as 1/0 by applying change_string to every value.
# Guard on dtype: once the column is numeric it has already been encoded, and
# applying change_string again would turn every value into 0 (no number
# equals "Yes"), silently corrupting the column when the cell is re-run.
if not pd.api.types.is_numeric_dtype(df_shopping['Card Member']):
    df_shopping['Card Member'] = df_shopping['Card Member'].apply(change_string)
df_shopping.head()
    

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


The scale for Annual Income is much larger than all the other values in the dataset. We can bring it into a comparable range by dividing by 1,000, so that income is expressed in thousands.

In [39]:
# Divide Annual Income by 1000 so its magnitude is closer to the other columns.
# NOTE: every execution divides again, so run this cell once per kernel session.
df_shopping['Annual Income'] = df_shopping['Annual Income'].div(1000)
df_shopping.head(n=5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


#### Renaming the columns


In [44]:
# Replace column labels containing spaces or parentheses with underscore names.
renamed_columns = {
    'Annual Income': 'Annual_Income',
    'Spending Score (1-100)': 'Spending_Score',
}
df_shopping.rename(columns=renamed_columns, inplace=True)
df_shopping.head(n=5)

Unnamed: 0,Card Member,Age,Annual_Income,Spending_Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


#### Saving the cleaned file


In [45]:
# Save the cleaned shopping data to CSV; index=False keeps the row index out
# of the file. The stray leading "<" was removed from the filename: it is not
# a legal filename character on Windows and was not part of the intended name.
file_path = "shopping_data_cleaned.csv"
df_shopping.to_csv(file_path, index=False)