# Task 1: Download and import dataset

In [38]:
import pandas as pd

In [39]:
# Load the mammographic masses dataset from a CSV file given by a relative path
df = pd.read_csv('mammographic_masses_data.csv')

# Task 2: Get dataset on screen

In [40]:
# Preview the first rows of the DataFrame (head() returns five rows by default)
df.head()

Unnamed: 0,BA,Age,Shape,Margin,Density,Severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [41]:
# Print each column's dtype and non-null count; a non-null count below the
# total number of entries means that column has missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961 entries, 0 to 960
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   BA        959 non-null    float64
 1   Age       956 non-null    float64
 2   Shape     930 non-null    float64
 3   Margin    913 non-null    float64
 4   Density   885 non-null    float64
 5   Severity  961 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 45.2 KB


In [42]:
# Show summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

Unnamed: 0,BA,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.300313,55.487448,2.721505,2.796276,2.910734,0.463059
std,0.683469,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,6.0,96.0,4.0,5.0,4.0,1.0


In [43]:
# With .loc, pick the Margin value of every row whose Severity equals 1
df.loc[df['Severity'].eq(1), 'Margin']

0      5.0
1      1.0
2      5.0
4      5.0
8      5.0
      ... 
951    5.0
952    4.0
955    4.0
957    5.0
959    5.0
Name: Margin, Length: 445, dtype: float64

In [44]:
import plotly.express as px

In [45]:
# Scatter plot: Shape on the x axis, Age on the y axis, points colored by Severity
fig = px.scatter(
    df,
    x='Shape',
    y='Age',
    color='Severity',
    title='Scatter plot of Shape vs Age colored by Severity',
)
fig.show()

# Histogram of the Age attribute
fig = px.histogram(
    df,
    x='Age',
    title='Histogram of Age',
)
fig.show()

# Heatmap of the pairwise correlations between the dataset's columns,
# with each coefficient printed in its cell
corr = df.corr()
fig = px.imshow(
    corr,
    text_auto=True,
    title='Correlation Matrix',
)
fig.show()

# Task 3: Preprocessing

In [113]:
# Work on a copy of the DataFrame so that preprocessing leaves the original `df` unchanged
df_copy = df.copy()

In [114]:
# Drop the rows where BA is missing
df_copy = df_copy.dropna(subset=['BA'])

In [115]:
# Fill missing Age values with the mean Age of the rows still in df_copy.
# The result is assigned back instead of calling fillna(inplace=True) on the
# df_copy['Age'] selection: that chained in-place form raises a FutureWarning in
# pandas 2.x and leaves df_copy unchanged once Copy-on-Write is the default (pandas 3.0).
df_copy = df_copy.assign(Age=df_copy['Age'].fillna(df_copy['Age'].mean()))

In [116]:
# Drop the rows that are missing any of Shape, Margin or Density
df_copy = df_copy.dropna(subset=['Shape', 'Margin', 'Density'])

In [None]:

# Compare the Age distribution before and after cleaning:
# first the original DataFrame, then the cleaned copy.
fig1 = px.histogram(df, x='Age', title='Histogram of Age (Original Data)')
fig1.show()

fig2 = px.histogram(df_copy, x='Age', title='Histogram of Age (Cleaned Data)')
fig2.show()

In [118]:
def normalize(column):
    """Rescale numeric values to the [0, 1] range with min-max scaling.

    Each value is mapped to ``(value - min) / (max - min)``, so the smallest
    value becomes 0 and the largest becomes 1. Missing values stay missing.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale. A DataFrame also works, in which case
        every column is rescaled independently.

    Returns
    -------
    pandas.Series (or DataFrame, matching the input)
        The rescaled values on the same index as ``column``. Data with no
        spread (max == min) is returned as 0.0 rather than NaN.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val

    # A constant input has a zero range, and 0 / 0 would turn every value into
    # NaN. Dividing by 1 instead maps such data to 0.0.
    if hasattr(value_range, 'where'):
        # DataFrame input: value_range holds one range per column.
        value_range = value_range.where(value_range != 0, 1)
    elif value_range == 0:
        value_range = 1

    return (column - min_val) / value_range


In [119]:
# Min-max scale the Age column of the cleaned data. Despite its name, this variable
# holds only what normalize returns for df_copy['Age'] (the rescaled Age values),
# not a full copy of df_copy.
df_copy_normalized = normalize(df_copy['Age'])

In [121]:
# Render plots showing the Age distribution before and after min-max normalization.
# df_copy is the cleaned data (not the raw file), so its title is labelled
# "Cleaned Data", the same label this frame gets in the cleaning comparison.
fig1 = px.histogram(df_copy, x='Age', title='Histogram of Age (Cleaned Data)')
fig2 = px.histogram(df_copy_normalized, x='Age', title='Histogram of Age (Normalized Data)')
fig1.show()
fig2.show()