In [None]:
import pandas as pd
# Fetch the course dataset (a CSV hosted on GitHub) into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/a-forty-two/EY_batch8_11Nov_AIplusOpenAI/refs/heads/main/data.csv',
)

In [None]:
# Peek at the last five rows to sanity-check the load.
data.tail(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()

In [None]:
# Correlation measures how strongly two variables (columns / dimensions) move together.
#
# When several columns are very strongly correlated they carry essentially the
# same information, so keeping just one of them may be enough.

# The first 12 columns hold 'id', 'diagnosis' and the "mean" features.
# 'diagnosis' is the label, so it is dropped from the feature list here;
# 'id' is deliberately kept for now so its (lack of) correlation can be inspected.
all_cols = data.columns[:12].tolist()
all_cols.remove('diagnosis')

# Pairwise correlation between the selected columns.
cors = data[all_cols].corr()
print(cors)

In [None]:
import seaborn as sns
# Correlation coefficients always lie in [-1, 1]. Pinning the colour scale to
# that range puts the neutral midpoint of the diverging 'coolwarm' palette at
# exactly 0, so red always means positive and blue always means negative
# correlation, regardless of the values that happen to be present in `cors`.
sns.heatmap(cors, cmap='coolwarm', vmin=-1, vmax=1)

# Learning no. 1: radius, area and perimeter are highly correlated
# (their pairwise correlation tends to exactly 1), so we don't need all 3 of them,
# but rather just 1 of them!
# All 3 of them are functions of a single variable: the radius!


# Learning no. 2:
# ID column -> we see that the ID column is NOT correlated with
# absolutely anything else! It seems to be a totally USELESS
# dimension! Do we remove it? Or do we conduct more tests to
# validate that it's indeed useless?


# Useful v/s useless columns:
# if a variable (dimension or column) is impacting either INPUTS
# or OUTPUT then it is useful, otherwise useless!

In [None]:
import matplotlib.pyplot as plt

# Colour every sample by its label: red for 'M', blue for anything else.
diagnosis = data.loc[:,'diagnosis']
encoding_logic = lambda val: 'red' if val=='M' else 'blue'
diagnosis_encoded = diagnosis.map(encoding_logic) # logic will be applied to EVERY element of diagnosis

# Pairwise scatter plots of every selected column against every other one.
# scatter_matrix creates its own figure (sized by `figsize`), so no plt.figure()
# call is needed beforehand -- calling it would only leave a stray empty figure
# in the cell output.
inps = data.loc[:, all_cols]
sm = pd.plotting.scatter_matrix(inps, c=diagnosis_encoded, figsize=(10,10))
plt.show()

In [None]:
# Same plot restricted to the first four columns so each panel is large enough to read.
# scatter_matrix creates its own figure (sized by `figsize`), so no plt.figure()
# call is needed beforehand -- calling it would only leave a stray empty figure
# in the cell output.
inps = data.loc[:, all_cols[:4]]
sm = pd.plotting.scatter_matrix(inps, c=diagnosis_encoded, figsize=(10,10))
plt.show()

# ID -> of special interest -> red and blue points are scattered all over the place,
# and the point clouds run parallel to both the x-axis and the y-axis!
# This means that when the output or any other input changes
# by a tiny amount (dx), the ID column shows NO corresponding change:
# it HAS NO IMPACT WHATSOEVER on them!

In [None]:
# Count the distinct values in the 'id' column. If this equals the number of
# rows, every id is unique.
data['id'].nunique()

In [None]:
# Number of rows in the dataset.
data.shape[0]
# When the row count matches the number of distinct ids, 'id' identifies each
# row uniquely and can serve as a 'primary key', i.e. as the INDEX itself.

In [None]:
# Inspect the DataFrame's current row index.
data.index
# our current index has no special purpose and is just a counter!

In [None]:
# Promote the unique 'id' column to the row index.
# The guard keeps this cell idempotent: once 'id' has been moved into the index
# it is no longer a column, and an unguarded set_index('id') would raise
# KeyError when the cell is run a second time.
if 'id' in data.columns:
    data = data.set_index('id')
data.tail()

In [None]:
# 'id' is now the index, not a feature, so drop it from the feature list.
# Filtering (instead of list.remove) keeps the cell idempotent: re-running it
# is a no-op rather than a ValueError once 'id' is already gone.
all_cols = [col for col in all_cols if col != 'id']

In [None]:
# Display the feature list to confirm 'id' is no longer part of it.
all_cols

In [None]:
# Compare, feature by feature, how the values are distributed for the two
# diagnosis classes ('M' in red, 'B' in blue) on a 5 x 2 grid of subplots.
inps = data.loc[:, all_cols[:10]]
bins = 12 # relative in nature
rows = 5  # grid layout is loop-invariant, so it is defined once up front
cols = 2
plt.figure(figsize=(10,10))
# enumerate([a,b,c]) -> [(0,a),(1,b), (2,c)]
for i, feature in enumerate(inps.columns):
  plt.subplot(rows, cols, i+1)
  for label, color in (('M', 'red'), ('B', 'blue')):
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # draws the equivalent normalised histogram plus density curve.
    # .loc[mask, column] selects in one step instead of chained indexing.
    sns.histplot(x=data.loc[data['diagnosis']==label, feature],
                 bins=bins, kde=True, stat='density',
                 color=color, label=label)
  plt.legend(loc='upper right')

plt.tight_layout()  # stop axis labels of neighbouring subplots from overlapping
plt.show()


In [None]:
# for sake of simplicity, we will continue
# with first 10 variables

In [None]:
# prediction?

# Statistics, Probability,

# 20, 21, 22, 23, ? ??? -> 24.25.....-> stats can't be applied everywhere!

# probability can be easily calculated for dice, dependent events, etc.

# what if using statistics we could figure out a geometric pattern
# in our data? and then using probability, convert into a
# 'probabilistic outcome' -> ML

In [None]:
# ML => y = mx + c

# outcome = weights * inputs + bias

# diagnosis = w1*radius_mean + w2*texture_mean + w3*area_mean + ...
#         + w10*fractal_dim_mean + bias

# 2a+b=1, a-b=-1, a=?,b=? -> simultaneous equations

# we already know diagnosis, we already know the values of radius, texture...

# what is MACHINE trying to LEARN?

In [None]:
# we already know X and Y, so machine is trying to learn the values
# of M and C such that equation makes sense and delivers least errors!

In [None]:
# diagnosis = w1*radius_mean + w2*texture_mean + w3*area_mean + ...
#         + w10*fractal_dim_mean + bias

# diagnosis is known, and so are features
# so ml is finding [w1,w2...w10], [bias]

# What we have now is an ALGORITHM
# as we try to fit it into data, we will create a "MODEL"
# algorithm is logic, model is actual numbers and values!

In [None]:
# y = 4*a*x^2 + c is an equation or ALGO
# diagnosis = 4.44 * radius^2 + 2 is a MODEL

In [None]:
# Linear Regression

# Regression-> continuous numbers
# classification-> binary-> this line can also be used to classify
# areas above the line, and below the line!

In [None]:
# Build the feature matrix (x) and the target vector (y) for modelling.
x = data.loc[:, all_cols]
y = data.loc[:, 'diagnosis']

# Encode the labels numerically: 'M' -> 1, 'B' -> 0.
logic = {'M':1, 'B':0}
lambda_logic = lambda val: logic[val]  # raises KeyError on any unexpected label
y = y.map(lambda_logic)

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so a Restart & Run All reproduces the same split
# (and therefore the same scores); stratify=y keeps the 1/0 class ratio the same
# in the training and the test portion.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Looking at five rows only spot-checks the encoding; the assertion verifies
# that EVERY training label is 1 or 0 (and no longer 'M' / 'B').
assert ytrain.isin([0, 1]).all(), "ytrain contains values other than 0 and 1"
print(ytrain.tail())

In [None]:
# The split shuffles the rows, so check that features and labels still line up:
# the assertion compares the full id index of xtrain and ytrain, and the tail
# below lets us eyeball the last few ids against the ytrain output.
assert xtrain.index.equals(ytrain.index), "xtrain and ytrain ids are misaligned"
xtrain.tail()

In [None]:
from sklearn.linear_model import LogisticRegression

# The model starts with a WRONG equation (arbitrary weights) and improves it
# iteratively. The features are not scaled, so the solver may need far more
# than the default 100 iterations to converge; a generous max_iter lets it
# finish instead of stopping early with a ConvergenceWarning.
model = LogisticRegression(max_iter=10_000)

# Training data is used to teach the model how to FIT into data

# testing data (scoring data) is used to figure out how good or bad our equation is!
model.fit(xtrain, ytrain)

In [None]:
# Ask the fitted model to label the held-out test features.
predictions = model.predict(xtest)
# ONLY TESTING DATA PROVIDED
# Model will now provide us predictions
# which we can verify against ytest outcomes that we already know!

In [None]:
# First five predicted labels (1 = 'M', 0 = 'B').
predictions[:5]

In [None]:
# First five true labels of the test set, for comparison with the predictions.
ytest.iloc[:5]