# Data Cleaning notebook

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
data = pd.read_csv('Raw_Skills_Dataset.csv')

In [3]:
data

Unnamed: 0,RAW DATA
0,What ifs
1,seniority
2,familiarity
3,functionalities
4,Lambdas
...,...
34111,negotiation
34112,deadlines
34113,"Self-motivated, enthusiastic and strong drive"
34114,negotiation


In [4]:
data.value_counts()

RAW DATA                    
Python                          272
AWS                             239
JavaScript                      220
Kubernetes                      180
Familiarity                     172
                               ... 
SDLC compliance                   1
SDRG                              1
SDTM specifications document      1
SEITs                             1
🏗                                 1
Length: 15677, dtype: int64

In [5]:
stop_words = set(stopwords.words('english'))

## 1. Removing stopwords 

In [6]:
#removing stopwords
sent=[]
s=""
for i in range(len(data)):
    words = word_tokenize(str(data.iloc[i].values[0]))
    s=""
    for j in words:
        if j.lower() not in stop_words:
            s=s+" "+j
    sent.append(s.strip())

#Sent 

## 2. Replacing several irrelevant puntuations 

In [7]:
for i in range(len(sent)):
    sent[i]=sent[i].replace('.','').replace('e.g','').\
                    replace(':','').replace('- ','-').\
                    replace('( ','(').replace(' )',')').\
                    replace('  ',' ').replace(', ',',').replace('Etc','').\
                    replace(' ,',',').replace(' /','/').replace('etc.','').\
                    replace('/ ','/').replace('eg ','').\
                    replace('ie ','').replace(' i.e','').replace('Eg',' ').replace('Ie','').strip().strip("""'(,.\-"'""")


In [8]:
#making first word of each data point capital
for i in range(len(data)):
    st=sent[i]
    if st!='':
        st = st.replace(st[0],st[0].upper(),1)
        sent[i]=st
#sent

## 3. Creating dataframe for result 

In [10]:
df = pd.DataFrame(sent,columns=['Technology Skills'])

In [11]:
df

Unnamed: 0,Technology Skills
0,Ifs
1,Seniority
2,Familiarity
3,Functionalities
4,Lambdas
...,...
34111,Negotiation
34112,Deadlines
34113,"Self-motivated,enthusiastic strong drive"
34114,Negotiation


In [12]:
df.value_counts()

Technology Skills               
Python                              318
AWS                                 256
JavaScript                          221
Familiarity                         221
Kubernetes                          196
                                   ... 
Firmware functionality                1
Firmware operation                    1
Firmware software requirements        1
Firmware/software QA Development      1
🏗                                     1
Length: 14502, dtype: int64

In [13]:
df.value_counts()[df.value_counts()>10]

Technology Skills                      
Python                                     318
AWS                                        256
JavaScript                                 221
Familiarity                                221
Kubernetes                                 196
                                          ... 
Backend software engineering experience     11
Better scalability                          11
Quality assurance                           11
Automated deployment pipelines              11
Threat modeling                             11
Length: 453, dtype: int64

In [14]:
#making list of some irrelevant datapoint
words_to_delete=['Familiarity','Visibility','Team mates','Observability','Deliverables','Deadlines','Documentation','Mentor','Willingness','Frameworks','Protocols','● Experience','Large dataset','Schedules','Fluency','Seniority','Negotiation','Functionalities']

In [15]:
#removing duplicates
df.drop_duplicates(keep='first',inplace=True)


In [16]:
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,Technology Skills
0,Ifs
1,Seniority
2,Familiarity
3,Functionalities
4,Lambdas
...,...
14497,Telecom product
14498,Leadership qualities
14499,Actalent
14500,"Self-motivated,enthusiastic strong drive"


In [17]:

invalid_data_index=[]
for i in range(len(df)):
    if df.iloc[i].values[0] in  words_to_delete:
        invalid_data_index.append(i)

In [18]:
#removing irrelevant datapoint 
for i in range(len(df)):
    if len(df.iloc[i].values[0])==1 or len(df.iloc[i].values[0])==0:
        print(df.iloc[i].values[0])
        invalid_data_index.append(i)
invalid_data_index


#
+
﻿
Ø

∙
❤
🏗


[1,
 2,
 3,
 21,
 225,
 237,
 344,
 410,
 511,
 534,
 580,
 1163,
 1242,
 1250,
 1253,
 1559,
 8266,
 14501,
 188,
 1103,
 2673,
 4544,
 4655,
 7929,
 8810,
 9219,
 13330]

In [19]:
for i in invalid_data_index:
    df.drop(i,inplace=True)

In [20]:
df.value_counts()

Technology Skills                                                      
# # ENG                                                                    1
Perth                                                                      1
Performant solutions                                                       1
Performant,scalable resilient applications                                 1
Performant,scalable,secure software                                        1
                                                                          ..
Enterprise Architecture roles                                              1
Enterprise Class Financial application web portals                         1
Enterprise Cloud computing                                                 1
Enterprise Customer Care (ECC) Professional Services (PS) business unit    1
﻿Tier 1 Educational Background Preferred -IIT/NIT/DTU                      1
Length: 14475, dtype: int64

In [21]:
df.reset_index(drop=True,inplace=True)

In [22]:
df

Unnamed: 0,Technology Skills
0,Ifs
1,Lambdas
2,Java Streams
3,Object Oriented analysis
4,Relational Databases
...,...
14470,C/embedded software
14471,Telecom product
14472,Leadership qualities
14473,Actalent


In [23]:
df.to_csv('Cleaned_Data.csv',index=False)
