## Fase 4: Limpieza de datos

Tenemos más de 200 columnas de las que sólo queremos las concernientes a sus datos de programación

Nos vamos a quedar con las siguientes columnas

* languages: la correspondientes a cada uno de los lenguajes soportados en codewars
* stats sociales: allies, clan, comments, followers, following, github, etc. 
* stats estadísticos: honor, last seen, leaderboard position, etc. 


In [None]:
# imports 

import pandas as pd
import re

from funciones_scraping import get_languages

pd.options.display.max_columns = None

df = pd.read_csv('../output/codewar_users.csv', index_col=0)

def get_numeric_one(x, deletes=''): 
    try: 
        num = str(x)
        for ch in deletes: 
            num = num.replace(ch, '').strip()
        return float(num)
    except: 
        return x


def get_numeric_groups(x, groups=1): 
    try: 
        numbers = [int(number) for number in re.findall(r'\d+', x)]
        # in case x=0 or x=='0'
        if str(x) == '0': 
            numbers = [0] * groups
    except: 
        numbers = [0] * groups
    else: 
        pass
    finally: 
        # return list if groups > 1 else return first element in list
        return numbers[:groups] if groups > 1 else numbers[0]
    
def get_highest_trained(x): 
    try: 
        res = x.split('(')[0].strip()
        return res
    except: 
        return x
    
def get_honor_percentile(x): 
    try: 
        res = x.replace('Top', '').replace('%', '').strip()
        return float(res)
    except: 
        return x

In [2]:
# clean column names
df.columns = df.columns.str.replace(' ', '_').str.replace('.', '').str.lower()

# replace nan in numeric colmuns with 0
df._get_numeric_data().fillna(0, inplace=True)


# creates two new columns and delete the previous
df["authored_translations"], df["approved_translations"] = zip(*df["authored_translations"].map(lambda x: get_numeric_groups(x, groups=2)))
df["authored_translations"].value_counts()

# creates new column and delete the previous
# df['avg_rank'] = df['avg_rank'].apply(lambda x: get_numeric_groups(x, groups=1))
df['avg_rank'] = df['avg_rank'].apply(lambda x: get_numeric_one(x, deletes='kyudan')).fillna(0)

# creates new column and delete the previous
df['avg_satisfaction_rating'] = df['avg_satisfaction_rating'].apply(lambda x: get_numeric_one(x, deletes='%')).fillna(0)


# creates new column and delete the previous
df['best_practice'] = df['best_practice'].apply(get_numeric_one).fillna(0)


# creates new column and delete the previous
# df['best_practice_solutions'] = df['best_practice_solutions'].str.replace(',', '').apply(lambda x: get_numeric_groups(x, groups=1))
df['best_practice_solutions'] = df['best_practice_solutions'].apply(lambda x: get_numeric_one(x, deletes=',')).fillna(0)


# creates new column and delete the previous
# df['clever_solutions'] = df['clever_solutions'].str.replace(',', '').apply(lambda x: get_numeric_groups(x, groups=1))
df['clever_solutions'] = df['clever_solutions'].apply(lambda x: get_numeric_one(x, deletes=',')).fillna(0)

# creates two new columns and delete the previous
df["comments"], df["replies"] = zip(*df["comments"].map(lambda x: get_numeric_groups(x, groups=2)))

# creates two new columns and delete the previous
df["created"], df["beta"] = zip(*df["created"].map(lambda x: get_numeric_groups(x, groups=2)))

# cleans dates column
df['date'] = pd.to_datetime(df['date'])
df['ended_on'] = pd.to_datetime(df['ended_on'])
df['first_completed'] = pd.to_datetime(df['first_completed'])
df['last_completed'] = pd.to_datetime(df['last_completed'])
df['last_seen'] = pd.to_datetime(df['last_seen'])
df['member_since'] = pd.to_datetime(df['member_since'])

# creates new column and delete the previous
df['highest_trained'] = df['highest_trained'].apply(get_highest_trained)

# creates new column and delete the previous
df['honor'] = df['honor'].apply(lambda x: get_numeric_one(x, deletes=',')).fillna(0)

# creates new column and delete the previous
# df['honor_percentile'] = df['honor_percentile'].apply(get_honor_percentile)
df['honor_percentile'] = df['honor_percentile'].apply(lambda x: get_numeric_one(x, deletes='Top%')).fillna(0)

# creates two new columns and delete the previous
df["kumite"], df["started_kumite"] = zip(*df["kumite"].map(lambda x: get_numeric_groups(x, groups=2)))

# creates new column and delete the previous
df['leaderboard_position'] = df['leaderboard_position'].apply(lambda x: get_numeric_one(x, deletes='#')).fillna(0)

# creates new column and delete the previous
df['rank'] = df['rank'].apply(lambda x: get_numeric_one(x, deletes='kyudan')).fillna(0)

# creates new column and delete the previous
df['total_collected'] = df['total_collected'].apply(get_numeric_one).fillna(0)

# creates two new columns and delete the previous
df["translations"], df["translations_aproved"] = zip(*df["translations"].map(lambda x: get_numeric_groups(x, groups=2)))

### 4.13 Guardamos los datos

In [5]:
df.to_csv('../output/codewar_users_clean.csv')
df.shape

(13729, 239)

In [4]:
df.head()

Unnamed: 0,1_kyu,2_kyu,3_kyu,4_kyu,5_kyu,6_kyu,7_kyu,8_kyu,advanced_language_features,agda,algebra,algorithms,allies,angular,applied_computer_science,arguments,arithmetic,arrays,ascii,asynchronous,authored_kata,authored_translations,avg_rank,avg_satisfaction_rating,babel,basic_language_features,best_practice,best_practice_solutions,bf,big_integers,binary,binary_search_trees,bits,bitwise_operators,booleans,bugs,c,case/switch_statements,character_encodings,chars,ciphers,clan,classes,clever,clever_solutions,clojure,closures,coffeescript,collections,comments,completed_kata,completed_on_1st_attempt,computability_theory,computational_science,conditional_statements,contributed_kata,control_flow,coq,cpp,created,cryptography,crystal,csharp,current_streak,dart,data,data_conversion,data_structures,data_types,databases,date,dates/time,decimals,declarative_programming,decoding,decryption,design_patterns,design_principles,dictionary,dynamic_programming,elixir,elm,encoding,encryption,ended_on,erlang,es2015,esoteric_languages,event_handling,exception_handling,filtering,first_completed,followers,following,formats,formatting,fortran,frameworks,fsharp,functional_programming,functions,fundamentals,game_boards,games,geometry,github,go,graphics,graphs,groovy,hacking_holidays,hashes,haskell,higher-order_functions,highest_trained,honor,honor_percentile,idris,immutability,information_systems,inheritance,integers,interfaces,interpreters,interview_questions,io,iterators,java,javascript,json,julia,kata_approvals,kata_attempts,kotlin,kumite,lambdas,last_completed,last_seen,leaderboard_position,linked_lists,linkedin,lists,logic,loops,lua,machine_learning,map/reduce,maps,mathematics,member_since,memoization,metaprogramming,methods,modules,most_number_of_days,most_recent,mutability,name,nasm,networks,nim,nodejs,numbers,objc,object-oriented_programming,objects,observers,ocaml,optimization,parsing,performance,permutations,php,physics,polymorphism,powershell,programming_paradigms,properties,prototype-based_programming,prototypes,purescript,puzzles,python,queues,r,racket,rank,react,reason,recursion,refactoring,reflection,regular_expressions,reporting,ruby,rules,rust,scala,search,security,sequences,sets,shell,simulation,skills,social,solidity,sorting,sparse_arrays,sql,state_machines,statistics,streams,strings,swift,tables,testing,theorem_proving,theoretical_computer_science,total_collected,total_completed_kata,total_completions,total_languages_trained,total_stars,translations,trees,typescript,unicode,user,utilities,validation,variables,vectors,weak_typing,approved_translations,replies,beta,started_kumite,translations_aproved
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,5.0,76.0,0.0,0.0,0,0.0,96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1187.0,0.0,0.0,0.0,0.0,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5295,0.0,0.0,0.0,0.0,0.0,167.0,0.0,0.0,3342.0,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,NaT,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,https://github.com/FArekkusu,0.0,0.0,0.0,0.0,0.0,0.0,738.0,0.0,Python,31022.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1384.0,12981.0,0.0,0.0,48.0,0.0,0.0,296,0.0,NaT,2019-06-01,31,0.0,,0.0,9.0,0.0,0.0,0.0,0.0,0.0,3.0,2017-09-01,0.0,0.0,0.0,0.0,0.0,C#,0.0,Alexander Fedorov,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18638.0,0.0,359.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4401.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"{'dmivlge', 'Blue_Velvet', 'dandgerson', 'Drag...",0.0,0.0,0.0,707.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48,3814,481.0,10.0,125.0,285,0.0,0.0,0.0,FArekkusu,0.0,0.0,0.0,0.0,0.0,0,1749,1,276,239
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,2450.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,NaT,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,JavaScript,5902.0,0.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,520.0,2621.0,0.0,0.0,0.0,0.0,0.0,5,0.0,NaT,2019-05-01,481,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-04-01,0.0,0.0,0.0,0.0,0.0,JavaScript,0.0,Toni,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"c#, .net, javascript (in learning mode)",{'aryan-firouzian'},0.0,0.0,0.0,260.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1116,,7.0,,0,0.0,37.0,0.0,bladez,0.0,0.0,0.0,0.0,0.0,0,30,0,5,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6.0,98.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Vauntz.com,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,NaT,7.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,https://github.com/marutib,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,JavaScript,8518.0,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,13038.0,0.0,0.0,0.0,0.0,0.0,5,0.0,NaT,2018-08-01,283,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014-06-01,0.0,0.0,0.0,0.0,0.0,JavaScript,0.0,Maruti Borker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"{'OverZealous', 'chuksjoe', 'Aria_vt', 'surtic...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,810,88.0,3.0,22.0,0,0.0,0.0,0.0,marutiborker,0.0,0.0,0.0,0.0,0.0,0,24,0,7,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0,6.0,82.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,155.0,0.0,0.0,0.0,0.0,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,7.0,54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,938.0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,NaT,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://github.com/jungerstein,0.0,0.0,0.0,0.0,0.0,0.0,38.0,0.0,Python,7372.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,47.0,0.0,0.0,0.0,0.0,0.0,18,0.0,NaT,2019-04-01,352,0.0,,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,2014-10-01,0.0,0.0,0.0,0.0,0.0,Racket,0.0,Unknown,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,6558.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"programming in basic, arithmetic, pencil and p...","{'joolius', 'miller.a', 'UkioIkira'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,2,1239,243.0,9.0,19.0,0,0.0,0.0,0.0,jungerstein,0.0,0.0,0.0,0.0,0.0,0,18,4,21,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,21.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0,6.0,91.0,0.0,4.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,417.0,0.0,0.0,0.0,0.0,Unknown,0.0,0.0,0.0,297.0,0.0,0.0,3.0,252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,0.0,0.0,0.0,0.0,0.0,0.0,NaT,21.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,https://github.com/lilsweetcaligula,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Python,8580.0,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1795.0,0.0,0.0,0.0,0.0,0.0,2,0.0,NaT,2019-06-01,278,0.0,https://www.linkedin.com/in/lilsweetcaligula/,0.0,0.0,0.0,115.0,0.0,0.0,0.0,0.0,2016-08-01,0.0,0.0,0.0,0.0,0.0,JavaScript,0.0,Caligula,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2341.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1451.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,0.0,"pythonista, c, rubyist, javascript, scheme, cl...","{'damjan', 'hakatom', 'eb110', 'donaldsebleung...",0.0,0.0,0.0,28.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,54,1418,5151.0,9.0,133.0,124,0.0,0.0,0.0,lilsweetcaligula,0.0,0.0,0.0,0.0,0.0,0,77,6,2,64


### Resultado

Hemos generado un CSV con registros y columnas (11500x57) que hemos limpiado. Todos salvo 4 son te tipo numérico lo que nos ayudará a análisis posteriores. 
Las columnas no numéricas son: 
* username: nos permitirá identificar al usuario
* linkedin: permite su acceso por las redes sociales y por la API propia. 
* github: permite su acceso por la API propia. 
* most recent: Permitirá ejecutar algoritmos de ML supervisado para intentar predecir este valor

In [None]:
df.head()