## Data Analysis

In [1]:
#Working with data frames
import pandas as pd
import numpy as np

In [2]:
# Load the raw training and test sets (paths are relative to the working directory).
train = pd.read_csv('RawStrokeData_Train.csv')
test = pd.read_csv('RawStrokeData_Test.csv')

In [3]:
# Preview the first rows of the raw training data
train.head()

Unnamed: 0,id,sex and age,high_BP,heart_condition_detected_2017,married,job_status and living_area,average_blood_sugar,BMI,smoker_status,TreatmentA,TreatmentB,TreatmentC,TreatmentD,stroke_in_2018
0,16053,"F, 61",1,0,1,government?Remote,71.67,36.6,non-smoker,,,,,0
1,1459,"F, 30",0,0,0,Remote?private_sector,107.95,30.4,quit,,,,,0
2,7678,"F, 51",1,0,1,government?Remote,76.49,42.1,active_smoker,,,,,0
3,34943,"F, 54",0,0,1,government?City,113.98,57.3,quit,,,,,0
4,17741,"M, 27",0,0,0,private_sector?City,70.6,26.7,active_smoker,,,,,0


In [4]:
# Preview the first rows of the raw test data
test.head()

Unnamed: 0,id,sex and age,high_BP,heart_condition_detected_2017,married,job_status and living_area,average_blood_sugar,BMI,smoker_status,TreatmentA,TreatmentB,TreatmentC,TreatmentD
0,33327,"F, 36",0.0,0.0,1.0,private_sector?Remote,76.05,33.4,active_smoker,,,,
1,839,"F, 40",0.0,0.0,1.0,City?government,73.77,30.1,non-smoker,,,,
2,11127,"M, 59",0.0,0.0,1.0,business_owner?Remote,62.95,30.8,,,,,
3,20768,"33, F",0.0,0.0,1.0,private_sector?City,68.81,36.5,quit,,,,
4,37774,"F, 22",0.0,0.0,0.0,private_sector?City,122.89,30.8,active_smoker,,,,


# Data Cleaning

## Age and Gender
This one was particularly hard, and not because the two values were together in one column. Sometimes age and gender were swapped, entered incorrectly, or written as a word.

In [5]:
#Define a function that converts numbers in words to integers
def text2int(textnum, numwords={}):
    """Convert a number written in English words to an integer.

    Parameters
    ----------
    textnum : str
        Text such as ``"twenty one"``, ``"twenty-one"`` or
        ``"one hundred and five"``. Hyphens are treated as spaces and words
        are matched case-insensitively.
    numwords : dict, optional
        Maps each known word to a ``(scale, increment)`` pair. The mutable
        default is deliberate: it is filled on the first call and reused as a
        cache afterwards. An empty dict passed by the caller is filled in place.

    Returns
    -------
    int or object
        The parsed integer, or ``textnum`` itself (unchanged) when it is not a
        string or contains a word that is not a known number word.
    """
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
            "sixteen", "seventeen", "eighteen", "nineteen",]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
                "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):    numwords[word] = (1, idx)
        for idx, word in enumerate(tens):     numwords[word] = (1, idx * 10)
        # "hundred" scales by 10**2; the following scales by 10**3, 10**6, ...
        for idx, word in enumerate(scales):   numwords[word] = (10 ** (idx * 3 or 2), 0)

    # Non-string input (e.g. a float NaN) cannot be parsed; hand it back unchanged.
    if not isinstance(textnum, str):
        return textnum

    current = result = 0
    for word in textnum.replace("-", " ").split():
        # Prefer an exact match so caller-supplied tables keep working, then
        # fall back to the lower-cased spelling.
        key = word if word in numwords else word.lower()
        if key not in numwords:
            return textnum
        try:
            scale, increment = numwords[key]
            current = current * scale + increment
        except (TypeError, ValueError):
            # Malformed entry in a caller-supplied table: keep the best-effort
            # contract and return the input rather than raising.
            return textnum
        # Scales above one hundred close the current group ("two thousand ...").
        if scale > 100:
            result += current
            current = 0

    return int(result + current)

def inter(a):
    """Best-effort conversion of ``a`` to ``float``.

    Returns ``float(a)`` when the conversion succeeds and ``0`` when ``a``
    cannot be converted, so callers can treat ``0`` as "no numeric value".
    """
    try:
        return float(a)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; KeyboardInterrupt and other
        # unrelated exceptions propagate instead of being hidden.
        return 0
# Marker printed once the definitions in this cell have been executed
print('done')

done


In [6]:
def age_gender(df):
    """Split the combined 'sex and age' column into gender flags and a numeric age.

    Each entry is split on commas and the first two pieces are inspected.
    Because the pieces may appear in either order, every piece is checked
    both for a gender label and for an age (digits or English number words).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a text column named 'sex and age'. Rows are handled
        by position, so the frame's index labels do not matter.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Male, Female, Other, Age)``, each of shape ``(len(df), 1)``.
        The gender arrays hold 0/1 flags; ``Age`` holds the parsed age, with
        0 where no age could be parsed.
    """
    male_list = ['m', 'male', 'mmale', 'mm']      # accepted spellings of male
    female_list = ['f', 'female', 'femalle']      # accepted spellings of female

    # Split into pieces; reindex guarantees columns 0 and 1 exist even when no
    # entry contains a comma (the missing column is then all NaN).
    pieces = df['sex and age'].str.split(',', expand=True).reindex(columns=[0, 1])

    n_rows = len(df)
    Age = np.zeros((n_rows, 1))
    Male = np.zeros((n_rows, 1))
    Female = np.zeros((n_rows, 1))
    Other = np.zeros((n_rows, 1))

    for col in (0, 1):
        # Normalise in plain Python: missing pieces become 'nan' / 'none'.
        tokens = [str(value).lower().strip() for value in pieces[col].tolist()]
        for pos, token in enumerate(tokens):
            if token in male_list:
                Male[pos] = 1
            elif token in female_list:
                Female[pos] = 1
            elif token == 'other':
                Other[pos] = 1
            elif token != 'nan':
                # Words -> int where possible, then anything numeric -> float.
                age = inter(text2int(token))
                # 0 means "not a number"; also reject inf so it never
                # lands in the Age column.
                if age != 0 and np.isfinite(age):
                    Age[pos] = age
    return Male, Female, Other, Age
# Marker printed once the definition in this cell has been executed
print('done')

done


In [7]:
# Add the gender flags and parsed age to both frames (modifies train and test in place)
train['Male'], train['Female'], train['Other'], train['Age'] = age_gender(train)
test['Male'], test['Female'], test['Other'], test['Age'] = age_gender(test)

# Sanity check: print the 15 most frequent values of each new training column
cols = ['Male','Female','Other','Age']
for col in cols:
    print(col)
    a = train[col].value_counts().head(15)
    print(a)
    print()

Male
0.0    20601
1.0    14271
Name: Male, dtype: int64

Female
1.0    20557
0.0    14315
Name: Female, dtype: int64

Other
0.0    34865
1.0        7
Name: Other, dtype: int64

Age
54.0    579
56.0    573
55.0    565
60.0    565
81.0    550
53.0    550
58.0    545
50.0    537
57.0    536
52.0    536
47.0    531
51.0    521
48.0    520
49.0    518
59.0    517
Name: Age, dtype: int64



## Smoking Status
I didn't use `pd.get_dummies()` here because the column contains so many different invalid values.

In [8]:
#Cleaning smoker_status ['quit', 'non-smoker', 'active_smoker']
def parse_smoker(df):
    """One-hot encode the 'smoker_status' column.

    Only the exact values 'quit' and 'active_smoker' are recognised; every
    other value (including 'non-smoker', missing values and invalid entries)
    is counted as non-smoker.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'smoker_status' column. Rows are handled by
        position, so the frame's index labels do not matter.

    Returns
    -------
    tuple of numpy.ndarray
        ``(quit, nonSmoker, activeSmoker)``, each a 0/1 float array of shape
        ``(len(df), 1)``. Exactly one of the three is 1 in every row.
    """
    status = df['smoker_status']

    def matches(label):
        # Element-wise equality as a plain boolean array (missing -> False).
        return status.eq(label).fillna(False).to_numpy(dtype=bool)

    def as_column(mask):
        return mask.astype(float).reshape(-1, 1)

    is_quit = matches('quit')
    is_active = matches('active_smoker')
    is_non_smoker = ~(is_quit | is_active)   # the catch-all bucket

    return as_column(is_quit), as_column(is_non_smoker), as_column(is_active)

# Add the smoking indicator columns to both frames (modifies train and test in place)
train['Quit'], train['NonSmoker'], train['ActiveSmoker'] = parse_smoker(train)
test['Quit'], test['NonSmoker'], test['ActiveSmoker'] = parse_smoker(test)

# Sanity check: print the value counts of each new training column
cols = ['Quit', 'NonSmoker', 'ActiveSmoker']
for col in cols:
    print(col)
    a = train[col].value_counts().head(15)
    print(a)
    print()

Quit
0.0    28842
1.0     6030
Name: Quit, dtype: int64

NonSmoker
1.0    23514
0.0    11358
Name: NonSmoker, dtype: int64

ActiveSmoker
0.0    29544
1.0     5328
Name: ActiveSmoker, dtype: int64



## Job Status and Living Area

In [9]:
def job_loc_parse(df):
    """Encode 'job_status and living_area' as job and location indicator columns.

    Each entry is split on '?' and the first two pieces are inspected.
    Because the pieces may appear in either order, every piece is matched
    against both the job spellings and the location spellings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a text column named 'job_status and living_area'.
        Rows are handled by position, so the frame's index labels do not matter.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Government, Private, Business, Unemployed, Leave, City, Remote)``,
        each a 0/1 float array of shape ``(len(df), 1)``. A row whose pieces
        match none of the known spellings is 0 in every array.
    """
    # Split into pieces; reindex guarantees columns 0 and 1 exist even when no
    # entry contains a '?' (the missing column is then all NaN).
    pieces = (
        df['job_status and living_area']
        .str.split('?', expand=True)
        .reindex(columns=[0, 1])
    )
    n_rows = len(df)

    def new_flag():
        return np.zeros((n_rows, 1))

    # Job status
    Government, Private, Business = new_flag(), new_flag(), new_flag()
    Unemployed, Leave = new_flag(), new_flag()
    # Living area
    Remote, City = new_flag(), new_flag()

    # Accepted spellings for every indicator; the sets are disjoint, so each
    # token maps to at most one indicator.
    spellings = (
        (Government, ['government', 'govt.']),
        (Private, ['private_sector', 'privattte', 'private', 'private sector']),
        (Business, ['biz', 'business_owner', 'business owner']),
        (Unemployed, ['unemployed', 'null', '']),
        (Leave, ['leave', 'parental_leave', 'parental leave']),
        (Remote, ['remote', 'r']),
        (City, ['city', 'c']),
    )
    flag_for_token = {}
    for flag, names in spellings:
        for name in names:
            flag_for_token[name] = flag

    for col in (0, 1):
        # Normalise in plain Python: missing pieces become 'nan' / 'none',
        # which match no spelling.
        tokens = [str(value).lower().strip() for value in pieces[col].tolist()]
        for pos, token in enumerate(tokens):
            flag = flag_for_token.get(token)
            if flag is not None:
                flag[pos] = 1

    return Government, Private, Business, Unemployed, Leave, City, Remote

# Unpack in the order job_loc_parse returns: the five job flags, then City, then Remote
train['Job: Gov'], train['Job: Pvt'], train['Job: Bus'], train['Job: Unemp'], train['Job: Parental'], train['Loc: City'], train['Loc: Remote'] = job_loc_parse(train)
test['Job: Gov'], test['Job: Pvt'], test['Job: Bus'], test['Job: Unemp'], test['Job: Parental'], test['Loc: City'], test['Loc: Remote']  = job_loc_parse(test)

In [10]:
# Spot-check: the raw combined column next to the indicator columns derived from it
# (.loc slicing is label-based, so ":10" includes the row labelled 10)
train.loc[:10,
          ['job_status and living_area','Job: Gov','Job: Pvt','Job: Bus',
           'Job: Unemp','Job: Parental','Loc: City','Loc: Remote']
         ]

Unnamed: 0,job_status and living_area,Job: Gov,Job: Pvt,Job: Bus,Job: Unemp,Job: Parental,Loc: City,Loc: Remote
0,government?Remote,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Remote?private_sector,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,government?Remote,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,government?City,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,private_sector?City,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,private_sector?Remote,0.0,1.0,0.0,0.0,0.0,0.0,1.0
6,private_sector?City,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7,business_owner?Remote,0.0,0.0,1.0,0.0,0.0,0.0,1.0
8,private_sector?City,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,private_sector?City,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Binary Categorical Variables
As with smoking status above, these columns contain many unclean values, but they are easier to handle.

In [11]:
#Binary Categorical variables --> Set all to 0 if not 1

# Columns expected to hold a 0/1 flag; every value other than 1 / '1' is treated as 0
toClean = ['TreatmentA', 'TreatmentB', 'TreatmentC', 'TreatmentD',
           'stroke_in_2018','high_BP','heart_condition_detected_2017', 'married']
def binary_encode(df):
    """Force every column listed in ``toClean`` to a clean 0.0/1.0 float column.

    A cell becomes 1.0 when it equals the number 1 or the string '1';
    everything else (missing values, other numbers, other text) becomes 0.0.
    Columns from ``toClean`` that are absent from ``df`` are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place; rows are handled by position,
        so the frame's index labels do not matter.

    Returns
    -------
    None
    """
    for var in toClean:
        if var not in df.columns:
            continue
        column = df[var]
        # Compare against both spellings separately so mixed-type columns work.
        is_one = column.eq('1') | column.eq(1)
        df[var] = is_one.fillna(False).to_numpy(dtype=bool).astype(float)
    return

# Clean the flag columns of both frames in place (test has no 'stroke_in_2018'; it is skipped)
binary_encode(train)
binary_encode(test)

# Sanity check: every cleaned training column should contain only 0.0 and 1.0
for var in toClean:
    print(var)
    print(train[var].value_counts())
    print()

TreatmentA
0.0    34563
1.0      309
Name: TreatmentA, dtype: int64

TreatmentB
0.0    34399
1.0      473
Name: TreatmentB, dtype: int64

TreatmentC
0.0    34555
1.0      317
Name: TreatmentC, dtype: int64

TreatmentD
0.0    34507
1.0      365
Name: TreatmentD, dtype: int64

stroke_in_2018
0.0    34220
1.0      652
Name: stroke_in_2018, dtype: int64

high_BP
0.0    31609
1.0     3263
Name: high_BP, dtype: int64

heart_condition_detected_2017
0.0    33218
1.0     1654
Name: heart_condition_detected_2017, dtype: int64

married
1.0    22405
0.0    12467
Name: married, dtype: int64



## Continuous Variables

In [12]:
def parse_cont(df):
    """Clean the continuous columns 'average_blood_sugar' and 'BMI' in place.

    For each column the placeholders '?' and '.' are treated as missing, the
    column is converted to float, and missing values are replaced by the mean
    of the remaining values in that same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a column holds text other than '?' or '.' that cannot be converted
        to float.
    """
    for var in ('average_blood_sugar', 'BMI'):
        # Build the cleaned column first and assign it back once: calling an
        # in-place method on df[var] is chained assignment, which pandas warns
        # about and which does not update df under Copy-on-Write.
        values = df[var].replace(["?", '.'], np.nan).astype('float')
        df[var] = values.fillna(values.mean())
    return

# Clean both frames in place; each frame is imputed with its own column means
parse_cont(train)
parse_cont(test)

## Dropping

In [13]:
# List every column now present in the training frame
train.columns

Index(['id', 'sex and age', 'high_BP', 'heart_condition_detected_2017',
       'married', 'job_status and living_area', 'average_blood_sugar', 'BMI',
       'smoker_status', 'TreatmentA', 'TreatmentB', 'TreatmentC', 'TreatmentD',
       'stroke_in_2018', 'Male', 'Female', 'Other', 'Age', 'Quit', 'NonSmoker',
       'ActiveSmoker', 'Job: Gov', 'Job: Pvt', 'Job: Bus', 'Job: Unemp',
       'Job: Parental', 'Loc: City', 'Loc: Remote'],
      dtype='object')

In [14]:
# Categorising columns and dropping unnecessary ones
# Drop: the raw text columns that the encoded columns above were derived from
Drop = ['sex and age', 'job_status and living_area','smoker_status']
# NOTE(review): 'Job' and 'Loc' are not column names in train.columns (the real
# columns are 'Job: Gov', 'Loc: City', ...) - presumably meant as prefixes; confirm before using Cat to select columns.
Cat = ['Male', 'Female', 'Other', 'Job', 'Loc','Quit', 'NonSmoker', 'ActiveSmoker']
# Bin: same column names as the toClean list used by binary_encode
Bin = ['TreatmentA', 'TreatmentB', 'TreatmentC', 'TreatmentD',
       'stroke_in_2018','high_BP','heart_condition_detected_2017', 'married']
# Cont: the numeric (non-indicator) feature columns
Cont = ['average_blood_sugar', 'BMI','Age']

In [15]:
# Remove the raw text columns from both frames in place.
# Not re-runnable: a second run raises KeyError because the columns are already gone.
train.drop(columns=Drop, inplace=True)
test.drop(columns=Drop, inplace=True)

In [16]:
# Summary statistics of the cleaned training frame (all remaining columns are numeric)
train.describe()

Unnamed: 0,id,high_BP,heart_condition_detected_2017,married,average_blood_sugar,BMI,TreatmentA,TreatmentB,TreatmentC,TreatmentD,...,Quit,NonSmoker,ActiveSmoker,Job: Gov,Job: Pvt,Job: Bus,Job: Unemp,Job: Parental,Loc: City,Loc: Remote
count,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0,...,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0,34872.0
mean,21733.106246,0.093571,0.047431,0.642493,109.176206,30.929459,0.008861,0.013564,0.00909,0.010467,...,0.172918,0.674295,0.152787,0.125803,0.570601,0.157318,0.004416,0.141116,0.500545,0.498365
std,12526.522382,0.291235,0.212561,0.479273,43.186101,7.644195,0.093716,0.115673,0.094911,0.101772,...,0.378182,0.468645,0.359788,0.331632,0.494997,0.364106,0.066308,0.348146,0.500007,0.500004
min,2.0,0.0,0.0,0.0,59.6,12.6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10883.75,0.0,0.0,0.0,82.12,25.7,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,21752.5,0.0,0.0,1.0,96.27,30.4,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,32617.25,0.0,0.0,1.0,116.74,35.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
max,43399.0,1.0,1.0,1.0,295.65,99.9,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Check dtypes and non-null counts of the cleaned training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34872 entries, 0 to 34871
Data columns (total 25 columns):
id                               34872 non-null int64
high_BP                          34872 non-null float64
heart_condition_detected_2017    34872 non-null float64
married                          34872 non-null float64
average_blood_sugar              34872 non-null float64
BMI                              34872 non-null float64
TreatmentA                       34872 non-null float64
TreatmentB                       34872 non-null float64
TreatmentC                       34872 non-null float64
TreatmentD                       34872 non-null float64
stroke_in_2018                   34872 non-null float64
Male                             34872 non-null float64
Female                           34872 non-null float64
Other                            34872 non-null float64
Age                              34872 non-null float64
Quit                             34872 non-null float

## Saving

In [18]:
# Final look at the cleaned training frame before it is written to disk
train.head(20)

Unnamed: 0,id,high_BP,heart_condition_detected_2017,married,average_blood_sugar,BMI,TreatmentA,TreatmentB,TreatmentC,TreatmentD,...,Quit,NonSmoker,ActiveSmoker,Job: Gov,Job: Pvt,Job: Bus,Job: Unemp,Job: Parental,Loc: City,Loc: Remote
0,16053,1.0,0.0,1.0,71.67,36.6,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1459,0.0,0.0,0.0,107.95,30.4,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,7678,1.0,0.0,1.0,76.49,42.1,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,34943,0.0,0.0,1.0,113.98,57.3,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,17741,0.0,0.0,0.0,70.6,26.7,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,26198,0.0,0.0,1.0,109.176206,30.929459,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
6,7284,0.0,0.0,1.0,100.85,37.5,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7,10704,0.0,1.0,1.0,188.39,30.929459,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
8,41988,0.0,0.0,0.0,161.72,23.1,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,12578,0.0,0.0,1.0,91.02,30.4,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# Write the cleaned training and test frames to CSV.
# NOTE(review): train is written with the default index=True (adds an unnamed
# index column), while test uses index=False - confirm the asymmetry is intended.
train.to_csv('train_clean.csv', sep=',')
test.to_csv('test_clean.csv', sep=',', index = False)