# Proyecto Final IH: Recomendador de Codewars|Github

In [None]:
# https://www.codewars.com/users/leaderboard

In [3]:
# imports 

import pandas as pd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup
from IPython.display import display
import time

from funciones_scraping import get_languages
from funciones_tiempo import timeit
from funciones_files import save_set_users, save_string_users, add_set_users, load_set_users

pd.options.display.max_columns = None

## Fase 1: Obtención de usuarios de forma iterativa

### Leaderboard
Top 500 de codewars

In [5]:
# usuarios de leaderboard

@timeit
def get_top_500(url='https://www.codewars.com/users/leaderboard', timeout=30):
    '''
    Download the Codewars leaderboard page and collect the link texts
    found in its table rows (the notebook uses them as usernames).

    Parameters
    ----------
    url : str
        Page to download.
    timeout : float
        Seconds to wait for the server. Without a timeout a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    set of str
        Text of every ``<a>`` element located inside a ``<tr>``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.
    '''
    html = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html, "lxml")
    return {e.text for e in soup.select('tr a')}

In [6]:
# Seed the running user set with a copy of the leaderboard users.
top500 = get_top_500()
total = set(top500)
len(total)

'get_top_500'  2699.78 ms


499

### Get Social

In [7]:
# get users in clan

# https://www.codewars.com/users/albertogcmr/following
# https://www.codewars.com/users/albertogcmr/followers
# https://www.codewars.com/users/albertogcmr/allies

def get_social(user, timeout=30):
    '''
    Collect the link texts listed on a user's "following", "followers"
    and "allies" pages.

    Parameters
    ----------
    user : str
        Codewars username, inserted into the page URL as-is.
    timeout : float
        Seconds to wait for each of the three requests, so that one
        unresponsive page cannot stall a long crawl.

    Returns
    -------
    set of str
        Text of every ``<a>`` element found inside a ``<table>`` on
        any of the three pages.

    Raises
    ------
    requests.exceptions.RequestException
        If a request fails or times out.
    '''
    res = set()
    for link in ['following', 'followers', 'allies']:
        url = 'https://www.codewars.com/users/{}/{}'.format(user, link)
        html = requests.get(url, timeout=timeout).content

        soup = BeautifulSoup(html, "lxml")
        # Grow the set in place instead of building a new one per page.
        res.update(e.text for e in soup.select('table a'))
    return res

# get_social('albertogcmr')

### Iteración para ampliar número de usuarios

A partir de la semilla inicial de 500 obtenemos casi 4000 usuarios de codewars

In [8]:
# Fourth iteration: start from the user set saved in the third file.

# Text files holding the user set reached after each crawl iteration.
iteracion = ['../output/usuarioscodewars-iter1.txt', 
             '../output/usuarioscodewars-iter2.txt', 
             '../output/usuarioscodewars-iter3.txt']

# save_set_users(users, filename=iteracion[2])
# Rebinds `total`, replacing the leaderboard-only set built earlier.
total = load_set_users(iteracion[2])
len(total)

11520

In [9]:
@timeit
def get_social_from_users_set(users_set):
    '''
    Expand a collection of users with everything ``get_social`` returns
    for each of them.

    Users whose lookup fails are skipped and written to
    '../errors/social-error.txt' through ``save_set_users``.

    Parameters
    ----------
    users_set : iterable of str
        Users to start from; the input itself is never modified.

    Returns
    -------
    set of str
        The original users plus all users discovered through them.
    '''
    res = set(users_set)
    errors = []
    for user in users_set:
        try:
            # update() grows the result in place; rebuilding it with
            # union() copied the whole accumulated set once per user.
            res.update(get_social(user))
        except Exception:
            # Best effort: record the failing user and keep crawling.
            # KeyboardInterrupt/SystemExit are deliberately not caught,
            # so a long run can still be interrupted.
            errors.append(user)

    save_set_users(users=errors, filename='../errors/social-error.txt')
    return res

In [None]:
# Add myself and a few more users by hand, then merge in the social
# circle of every account connected to 'albertogcmr'.

total = total | {'albertogcmr', 'boyander', 'VictorIrix'}
for user in get_social('albertogcmr'):
    total = total | get_social(user)
len(total)

### Persistencia de datos en un txt

In [10]:
# Write the current user set to the third-iteration file, then read it
# back from that same file.
save_set_users(users=total, filename=iteracion[2])
total = load_set_users(iteracion[2])

In [11]:
# Number of users after the save/load round-trip.
len(total)

11520

## Fase 2: Obtención de datos de usuario
### 2.1 API codewars

In [12]:
# GET user

def get_user_api(user='albertogcmr', timeout=30):
    '''
    Fetch a user's record from the Codewars API and return the decoded JSON.

    Parameters
    ----------
    user : str
        Codewars username, inserted into the API URL as-is.
    timeout : float
        Seconds to wait for the server. This call runs once per user in
        a crawl of thousands of users, so it must not block forever.

    Returns
    -------
    Whatever ``response.json()`` decodes from the response body. The
    HTTP status code is not checked.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    '''
    url = 'https://www.codewars.com/api/v1/users/{}'.format(user)
    response = requests.get(url, timeout=timeout)
    return response.json()

# get_user_api('albertogcmr')

In [13]:
def get_value_from_json(user_json, value='username'):
    '''
    Return the entry stored under a first-level key of the user JSON.

    Keys listed as possible values:
    ['username', 'honor', 'clan', 'leaderboardPosition', 'skills']

    A missing key raises KeyError.
    '''
    top_level_entry = user_json[value]
    return top_level_entry

def get_score_language(user_json, language):
    '''
    Return the score stored at
    user_json['ranks']['languages'][language]['score'].

    If any step of that lookup is missing, or an intermediate value is
    not subscriptable (for example None), the score is 0.

    Parameters
    ----------
    user_json : dict
        Decoded user record.
    language : str
        Language key to look up.
    '''
    try:
        score = user_json['ranks']['languages'][language]['score']
    except (LookupError, TypeError):
        # Only "path not present" failures mean "no score". A bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        score = 0
    return score


def get_scores(user_json):
    '''
    Build a dict with one entry per language returned by get_languages(),
    each holding the score get_score_language() finds for that language.
    '''
    return {lang: get_score_language(user_json, lang) for lang in get_languages()}

# get_score_language(user_json=alberto, language='sca')
# get_value_from_json(alberto, 'skills')
# get_scores(alberto)

### 2.2 Web Scraping codewars

Get stats from user

In [14]:
def get_all_stats(user, timeout=30):
    '''
    Scrape the stat boxes of a user's Codewars profile page into a dict.

    Each ``.stat-box div`` whose text looks like "Label:value" becomes
    an entry keyed by the lower-cased label. The "Profiles" box is
    handled separately: its links are scanned and the ones containing
    'github' or 'linkedin' are stored under those keys.

    Parameters
    ----------
    user : str
        Codewars username, inserted into the profile URL as-is.
    timeout : float
        Seconds to wait for the page, so that one unresponsive profile
        cannot stall the whole crawl.

    Returns
    -------
    dict
        Always contains 'username'; other keys depend on the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    '''
    url = 'https://www.codewars.com/users/{}'.format(user)
    html = requests.get(url, timeout=timeout).content
    soup = BeautifulSoup(html, "lxml")

    x = {}
    x['username'] = user
    for s in soup.select('.stat-box div'):
        # Split on the first ':' only, so a value that itself contains
        # a colon is kept whole instead of being cut at the second one.
        label, sep, value = s.text.partition(':')
        if label != 'Profiles':
            if not sep:
                # Not a "label:value" div: nothing to record. Indexing
                # the split result here used to raise IndexError and
                # discard the whole user.
                continue
            x[label.lower()] = value
        else:
            try:
                for e in s.find_all('a', href=True):
                    if 'github' in e['href']:
                        x['github'] = e['href']
                    if 'linkedin' in e['href']:
                        x['linkedin'] = e['href']
            except Exception:
                # Fall back to empty links, but let KeyboardInterrupt
                # and SystemExit propagate.
                x['github'] = ''
                x['linkedin'] = ''
    return x

# get_all_stats('albertogcmr')

In [18]:

# Language names returned by get_languages(); get_scores() uses the same
# call to produce one score entry per language.
languages = get_languages()
languages

{'bf (beta)',
 'c',
 'c#',
 'c++',
 'clojure',
 'coffeescript',
 'crystal',
 'dart',
 'elixir',
 'elm (beta)',
 'erlang (beta)',
 'f#',
 'fortran (beta)',
 'go',
 'groovy (beta)',
 'haskell',
 'java',
 'javascript',
 'julia (beta)',
 'kotlin (beta)',
 'lua (beta)',
 'nasm (beta)',
 'nim (beta)',
 'objective-c (beta)',
 'ocaml (beta)',
 'php',
 'powershell (beta)',
 'purescript (beta)',
 'python',
 'r (beta)',
 'ruby',
 'rust',
 'scala (beta)',
 'shell',
 'solidity (beta)',
 'sql',
 'swift',
 'typescript'}

In [19]:
def get_row(user):
    '''
    Build a dict with the data gathered for one user: the scraped
    profile stats plus one score per language. The dict can be added
    as a row of our pandas DataFrame.
    '''
    api_payload = get_user_api(user)
    record = get_all_stats(user)
    for language, score in get_scores(api_payload).items():
        record[language] = score
    return record

def create_row(data, new_user):
    '''
    Return a new DataFrame made of ``data`` plus one row for ``new_user``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows collected so far; it is not modified.
    new_user : str
        Username passed to ``get_row`` to build the new record.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the new row appended and a fresh 0..n-1 index.
    '''
    row = get_row(new_user)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported way to add a row.
    return pd.concat([data, pd.DataFrame([row])], ignore_index=True)

# ivan = get_row('ijcernicharo')
# alberto = get_row('albertogcmr')

### 2.3 Creación de DataFrame

In [20]:
# Each collected user will become one record in df_users, so take a
# list snapshot of the user set first.
total_n = list(total)
len(total_n)

11520

In [21]:
@timeit
def create_df(users):
    '''
    Build a DataFrame with one row per user, using ``get_row``.

    Users whose row cannot be built are skipped and written to
    '../errors/df-row-error.txt' through ``save_set_users``. If that
    write fails too, the list of failing users is printed instead.

    Parameters
    ----------
    users : iterable of str
        Usernames to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fetched user; empty if none succeeded.
    '''
    # Collect plain dicts and build the frame once at the end. Growing a
    # DataFrame row by row copies every previous row on each step, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    rows = []
    errors = []
    for i, user in enumerate(users):
        try:
            print(i, '\t', user)
            rows.append(get_row(user))
        except Exception:
            # Best effort per user. KeyboardInterrupt/SystemExit are not
            # caught, so a multi-hour run can still be stopped.
            errors.append(user)
    try:
        save_set_users(users=errors, filename='../errors/df-row-error.txt')
    except Exception:
        print('Error en la escritura del archivo de error')
        print(errors)

    return pd.DataFrame(rows)

In [None]:
# Scrape every user in total_n over the network. This is slow: the timings
# recorded from earlier runs are listed below (iteration 3 took about 8 hours).
df_users = create_df(total_n)
df_users.head()

# iter 1 'create_df'  1494739.36 ms (500 users)
# iter 2 'create_df'  9796075.92 ms (3939 users)
# iter 3 'create_df'  28818747.81 ms (11501 users)

## Fase 3: Persistencia de datos

In [22]:
# CSV files holding the DataFrame produced by each crawl iteration.
df_iter = ['../output/df-codewars-iter1.csv', 
           '../output/df-codewars-iter2.csv', 
           '../output/df-codewars-iter3.csv']


# df_users.to_csv(df_iter[2])
# Load the saved third-iteration dataset instead of scraping again.
df = pd.read_csv(df_iter[2])

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
# Check that the scraped 'linkedin' column is present in the loaded data.
'linkedin' in df.columns

True

In [25]:
# Reads the same file as df_iter[2] again and shows (rows, columns).
df = pd.read_csv('../output/df-codewars-iter3.csv')
df.shape

(11501, 239)

In [26]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0.1,Unnamed: 0,allies,bf (beta),c,c#,c++,clan,clojure,coffeescript,collections,comments,crystal,dart,elixir,elm (beta),erlang (beta),f#,followers,following,fortran (beta),go,groovy (beta),haskell,highest trained,honor,honor percentile,java,javascript,julia (beta),kotlin (beta),kumite,last seen,leaderboard position,lua (beta),member since,most recent,name,nasm (beta),nim (beta),objective-c (beta),ocaml (beta),php,powershell (beta),purescript (beta),python,r (beta),rank,ruby,rust,scala (beta),shell,solidity (beta),sql,swift,total completed kata,total languages trained,translations,typescript,username,github,skills,linkedin,algebra,algorithms,avg. rank,avg. satisfaction rating,created,fundamentals,logic,mathematics,numbers,total collected,total completions,total stars,arithmetic,arrays,data types,computability theory,recursion,theoretical computer science,games,puzzles,utilities,3 kyu,4 kyu,5 kyu,6 kyu,7 kyu,8 kyu,authored kata,authored translations,best practice,best practice solutions,clever,clever solutions,completed kata,completed on 1st attempt,contributed kata,current streak,date,ended on,first completed,kata approvals,kata attempts,last completed,most number of days,basic language features,classes,control flow,functions,modules,programming paradigms,binary,data,strings,2 kyu,validation,exception handling,security,data structures,linked lists,lists,geometry,functional programming,computational science,state machines,expressions,ascii,character encodings,dates/time,formats,decoding,encoding,prototypes,graphs,parsing,advanced language features,regular expressions,1 kyu,loops,objects,object-oriented programming,dynamic programming,data conversion,immutability,mutability,statistics,sorting,declarative programming,interpreters,optimization,performance,integers,cryptography,encryption,refactoring,bugs,io,streams,design patterns,memoization,permutations,trees,bash,networks,search,iterators,conditional statements,asynchronous,design 
principles,promises,singleton,esoteric languages,formatting,higher-order functions,filtering,best practices,machine learning,decryption,babel,es2015,physics,game boards,variables,big integers,sets,frameworks,react,lambdas,hashes,methods,databases,information systems,logic programming,applied computer science,angular,observers,hacking holidays,vectors,binary search trees,rules,nodejs,queues,ciphers,sequences,memory,metaprogramming,reflection,chars,django,tables,arguments,dictionary,case/switch statements,map/reduce,bits,inheritance,polymorphism,scopes,event handling,jsx,testing,graphics,prototype-based programming,unicode,recursion theory,json,interview questions,ranking,weak typing,maps,reporting,operators,closures,sparse arrays,properties,booleans,bitwise operators,decimals
0,0,1.0,0.0,0.0,0.0,0.0,Hagerty,0.0,0.0,0.0,0 (0 replies),0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,JavaScript (5 kyu),560,96th,0.0,581.0,0.0,0.0,0,Aug 2018,"#19,926",0.0,Oct 2014,C#,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4 kyu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45,2.0,0 (0 approved),0.0,brkiesel,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,374.0,0.0,0.0,0.0,0.0,FreeCodeCamp,0.0,0.0,1.0,0 (0 replies),0.0,0.0,0.0,0.0,0.0,0.0,377.0,386.0,0.0,0.0,0.0,0.0,JavaScript (3 kyu),1984,99th,14.0,4449.0,0.0,0.0,0,Sep 2018,"#2,492",0.0,Sep 2015,Python,Emanuele,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138.0,0.0,2 kyu,402.0,0.0,0.0,0.0,0.0,0.0,0.0,70,4.0,0 (0 approved),0.0,Em-Ant,https://github.com/Em-Ant,"js, node/express, python, c/c++",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,84.0,0.0,0.0,0.0,0.0,The Firehose Project,0.0,0.0,0.0,0 (0 replies),0.0,0.0,0.0,0.0,0.0,0.0,85.0,84.0,0.0,0.0,0.0,0.0,Ruby (5 kyu),335,93rd,0.0,82.0,0.0,0.0,0,Nov 2017,"#39,936",0.0,Nov 2015,JavaScript,Michael,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5 kyu,280.0,0.0,0.0,0.0,0.0,0.0,0.0,50,2.0,0 (0 approved),0.0,Mikeysax,https://github.com/Mikeysax,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,29.0,0.0,0.0,0.0,0.0,FPI,0.0,0.0,0.0,3 (1 replies),0.0,0.0,0.0,0.0,0.0,0.0,29.0,30.0,0.0,0.0,0.0,0.0,Python (2 kyu),3101,99th,20.0,4524.0,0.0,0.0,1 (1 Started),Oct 2017,"#1,223",0.0,Jul 2015,Go,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5938.0,0.0,2 kyu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,139,3.0,0 (0 approved),0.0,kimiamania,https://github.com/kimiamania,python,https://www.linkedin.com/in/rezhajulio,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,13.0,0.0,0.0,0.0,0.0,University of New Brunswick,0.0,0.0,0.0,0 (0 replies),0.0,0.0,0.0,0.0,0.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,JavaScript (6 kyu),127,81st,2.0,114.0,0.0,0.0,0,Jul 2018,"#119,097",0.0,Jun 2018,R,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6 kyu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17,2.0,0 (0 approved),0.0,Daigle,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Resultado

Tras 3 iteraciones usando como semilla el leaderboard de la web de codewars alcanzamos 11501 registros para nuestro dataset con 239 columnas. Dichas columnas se pueden agrupar en 3 tipos diferentes: 
1. Datos de usuario: username, name, member since, honor, etc.
2. Scores en cada lenguaje. El listado de lenguajes se puede obtener a partir de ```get_languages()```
3. Tags: Cantidad de veces que aparece ese tag en cada una de las katas que ha creado. Estas columnas no se incluirán en el Dataset una vez limpio, pero podrían usarse para futuras mejoras.