In [300]:
import pandas as pd


In [301]:
# site: https://archive.ics.uci.edu/ml/datasets/Skin+Segmentation

# 1st 3 cols = B,G,R
# 4th col = skin or no skin

In [302]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00229/Skin_NonSkin.txt'

c=pd.read_csv(url, sep = '\s+')


In [303]:
c = pd.DataFrame(c)

In [304]:
c.head()

Unnamed: 0,74,85,123,1
0,73,84,122,1
1,72,83,121,1
2,70,81,119,1
3,70,81,119,1
4,69,80,118,1


In [305]:
c.dtypes

74     int64
85     int64
123    int64
1      int64
dtype: object

In [306]:
# shifting column headers into row 
c.loc[-1] = [74, 85, 123, 1]
c.index = c.index + 1
c = c.sort_index()

In [307]:
c = c.rename(columns = {'74':'B', '85': 'G', '123': 'R', '1': 'Skin or No-Skin'})


In [308]:
c.tail()

Unnamed: 0,B,G,R,Skin or No-Skin
245052,163,162,112,2
245053,163,162,112,2
245054,163,162,112,2
245055,163,162,112,2
245056,255,255,255,2


In [309]:
y = pd.get_dummies(c['Skin or No-Skin']).iloc[:,1].dropna()


In [310]:
y = y.astype('int64')

In [312]:
y.dtypes

dtype('int64')

In [284]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: 2, dtype: uint8

In [313]:
X = c.iloc[:,:-1].dropna()

In [286]:
X.head()

Unnamed: 0,B,G,R
0,74,85,123
1,73,84,122
2,72,83,121
3,70,81,119
4,70,81,119


In [314]:
X.dtypes

B    int64
G    int64
R    int64
dtype: object

In [315]:
# normalizing values because they're on different scales
# normalizing gets them on a 0 to 1 scale
# using min-max normalization

X = X.fillna(value=0) #Fill null values
for col in X.columns:
    X[col] = (X[col]-min(X[col]))/ (max(X[col]) - min(X[col])) #We subtract the minimum and divide by the range forcing a scale of 0 to 1 for each feature

X.head()

Unnamed: 0,B,G,R
0,0.290196,0.333333,0.482353
1,0.286275,0.329412,0.478431
2,0.282353,0.32549,0.47451
3,0.27451,0.317647,0.466667
4,0.27451,0.317647,0.466667


In [316]:
# split my data into test and train datasets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [317]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(fit_intercept = False, C = 1e12)
model_log = logreg.fit(X_train, y_train)




In [318]:
model_log

LogisticRegression(C=1000000000000.0, class_weight=None, dual=False,
          fit_intercept=False, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [319]:
# predicting 

y_hat_test = logreg.predict(X_test)
y_hat_train = logreg.predict(X_train)

In [320]:
y_hat_test.shape # (61265,)
y_hat_train.shape # (183792,)

X_train.shape # (183792, 3)
X_test.shape # (61265, 3)
y_train.shape # (183792, 1)
y_test.shape # (61265, 1)

(61265,)

In [321]:
# seeing how well our model predicted 0s and 1s

import numpy as np

#We could subtract the two columns. If values or equal, difference will be zero. Then count number of zeros.
residuals = np.abs(y_train - y_hat_train)

print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))

0    166676
1     17116
Name: 2, dtype: int64
0    0.906873
1    0.093127
Name: 2, dtype: float64


In [270]:
# so, we can see that we got a 91% no-skin value and a 9% skin value
# this means that there's a class imbalance here with the no-skins way outnumbering the skins

In [322]:
# lalsjdlf