In [27]:
import pandas as pd
import numpy as np

import sqlite3 as sq

import string

In [2]:
#Open a connection to the local SQLite database file 'password_data.sqlite'
#NOTE(review): no con.close() is visible in this part of the notebook - confirm the connection is closed later
con = sq.connect('password_data.sqlite')

In [4]:
#Load every row and column of the Users table into a DataFrame and display it
pw_df = pd.read_sql_query('SELECT * FROM Users', con)
pw_df

Unnamed: 0,index,password,strength
0,0,zxe870819,1
1,1,xw46454nr23l,1
2,2,soporte13,1
3,3,accounts6000webhost.com,2
4,4,c443balg,1
...,...,...,...
99995,99995,obejofi215,1
99996,99996,fmiopvxb64,1
99997,99997,czvrbun38,1
99998,99998,mymyxe430,1


# Data Cleaning

In [6]:
#Remove irrelevant column
#Rebind instead of mutating in place, and tolerate an already-missing column,
#so re-running this cell does not raise KeyError once 'index' has been dropped.
pw_df = pw_df.drop(columns=['index'], errors='ignore')
pw_df

Unnamed: 0,password,strength
0,zxe870819,1
1,xw46454nr23l,1
2,soporte13,1
3,accounts6000webhost.com,2
4,c443balg,1
...,...,...
99995,obejofi215,1
99996,fmiopvxb64,1
99997,czvrbun38,1
99998,mymyxe430,1


In [10]:
#Check for duplicated rows (counts rows that repeat an earlier row across all columns)
pw_df.duplicated().sum()

0

In [15]:
#Check for missing values: this counts how many columns contain at least one null,
#not the number of null cells (0 means no column has any missing value)
pw_df.isnull().any().sum()

0

In [17]:
#Check the data type of each column
pw_df.dtypes

password    object
strength     int64
dtype: object

In [18]:
#List the distinct values of the numeric 'strength' column to confirm none are negative
#(a negative strength would be invalid here)
pw_df['strength'].unique()

array([1, 2, 0], dtype=int64)

# Semantic Analysis

In [22]:
#Check how many passwords are completely numeric (first value of the shape is the row count)
pw_df[pw_df['password'].str.isnumeric()].shape

(26, 2)

In [23]:
#Check how many passwords are completely alphabetic (first value of the shape is the row count)
pw_df[pw_df['password'].str.isalpha()].shape

(50, 2)

In [24]:
#Check how many passwords consist only of letters and/or digits (the majority!)
#Note: isalnum() is also True for purely numeric and purely alphabetic passwords,
#so this count includes the two groups counted above.
pw_df[pw_df['password'].str.isalnum()].shape

(97203, 2)

In [25]:
#Check how many passwords have all of their letters in upper case
#(isupper() requires at least one letter; digits and symbols are ignored)
pw_df[pw_df['password'].str.isupper()].shape

(1506, 2)

In [26]:
#Check how many passwords are in title case: every run of letters starts with an
#upper-case letter and continues in lower case (stricter than "begins with upper case")
pw_df[pw_df['password'].str.istitle()].shape

(932, 2)

In [29]:
#Check how many passwords that contains special characters
def spec_char(row):
    """Flag whether a password contains at least one ASCII punctuation character.

    Parameters
    ----------
    row : str
        A single password.

    Returns
    -------
    int
        1 if any character of ``row`` is in ``string.punctuation``, otherwise 0.
    """
    # Always return an explicit flag: the no-match path used to fall off the end
    # of the function and return None, which becomes NaN when applied to a Series.
    return int(any(c in string.punctuation for c in row))
        
#Shape of the subset of passwords for which spec_char returned 1 (first value is the row count)
pw_df[pw_df['password'].apply(spec_char)==1].shape

(2663, 2)

# Feature Engineering

In [38]:
##Extract features that could be useful for predicting the strength of a password

#Password length (number of characters in each password)
pw_df['length'] = pw_df['password'].str.len()

#Lowercase occurrence ratio
def lowercase_ratio(row):
    """Return the share of lowercase characters in a password, rounded to 3 decimals.

    Parameters
    ----------
    row : str
        A single password.

    Returns
    -------
    float
        Count of characters for which ``str.islower()`` is true divided by
        ``len(row)``; 0.0 for an empty password.
    """
    # Guard against empty strings, which would otherwise raise ZeroDivisionError.
    if len(row) == 0:
        return 0.0
    # Summing booleans counts the matches without building a throwaway list.
    return np.round(sum(ch.islower() for ch in row) / len(row), 3)
    
#Compute the ratio for every password and store it as a new feature column
pw_df['lowercase_ratio'] = pw_df['password'].apply(lowercase_ratio)

#uppercase occurrence ratio
def uppercase_ratio(row):
    """Return the share of uppercase characters in a password, rounded to 3 decimals.

    Parameters
    ----------
    row : str
        A single password.

    Returns
    -------
    float
        Count of characters for which ``str.isupper()`` is true divided by
        ``len(row)``; 0.0 for an empty password.
    """
    # Guard against empty strings, which would otherwise raise ZeroDivisionError.
    if len(row) == 0:
        return 0.0
    # Summing booleans counts the matches without building a throwaway list.
    return np.round(sum(ch.isupper() for ch in row) / len(row), 3)
    
#Compute the ratio for every password and store it as a new feature column
pw_df['uppercase_ratio'] = pw_df['password'].apply(uppercase_ratio)

#numeric occurrence ratio
def num_ratio(row):
    """Return the share of digit characters in a password, rounded to 3 decimals.

    Parameters
    ----------
    row : str
        A single password.

    Returns
    -------
    float
        Count of characters for which ``str.isdigit()`` is true divided by
        ``len(row)``; 0.0 for an empty password.
    """
    # Guard against empty strings, which would otherwise raise ZeroDivisionError.
    if len(row) == 0:
        return 0.0
    # Summing booleans counts the matches without building a throwaway list.
    return np.round(sum(ch.isdigit() for ch in row) / len(row), 3)
    
#Compute the ratio for every password and store it as a new feature column
pw_df['num_ratio'] = pw_df['password'].apply(num_ratio)

#special character occurrence ratio
def spec_ratio(row):
    """Return the share of ASCII punctuation characters in a password, rounded to 3 decimals.

    Parameters
    ----------
    row : str
        A single password.

    Returns
    -------
    float
        Count of characters found in ``string.punctuation`` divided by
        ``len(row)``; 0.0 for an empty password.
    """
    # Guard against empty strings, which would otherwise raise ZeroDivisionError.
    if len(row) == 0:
        return 0.0
    # Only characters listed in string.punctuation are counted; whitespace and
    # non-ASCII symbols are not. Summing booleans avoids a throwaway list.
    return np.round(sum(ch in string.punctuation for ch in row) / len(row), 3)
    
#Compute the ratio for every password and store it as a new feature column
pw_df['spec_ratio'] = pw_df['password'].apply(spec_ratio)

#Display the frame including the feature columns added in this cell
pw_df

Unnamed: 0,password,strength,length,lowercase_ratio,uppercase_ratio,num_ratio,spec_ratio
0,zxe870819,1,9,0.333,0.0,0.667,0.000
1,xw46454nr23l,1,12,0.417,0.0,0.583,0.000
2,soporte13,1,9,0.778,0.0,0.222,0.000
3,accounts6000webhost.com,2,23,0.783,0.0,0.174,0.043
4,c443balg,1,8,0.625,0.0,0.375,0.000
...,...,...,...,...,...,...,...
99995,obejofi215,1,10,0.700,0.0,0.300,0.000
99996,fmiopvxb64,1,10,0.800,0.0,0.200,0.000
99997,czvrbun38,1,9,0.778,0.0,0.222,0.000
99998,mymyxe430,1,9,0.667,0.0,0.333,0.000
