# Proyecto Final IH: Recomendador de Codewars|Github

In [None]:
# https://www.codewars.com/users/leaderboard

In [1]:
# imports 

import pandas as pd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup
from IPython.display import display
import time

pd.options.display.max_columns = None

## Fase 1: Obtención de usuarios de forma iterativa

### Leaderboard
Top 500 de codewars

In [2]:
# Leaderboard URL: download the page and parse it with the lxml parser.
# NOTE(review): get_top_500() below repeats this same fetch with its own
# local variables, so this cell looks redundant — confirm before removing.

url = 'https://www.codewars.com/users/leaderboard'
html = requests.get(url).content
soup = BeautifulSoup(html, "lxml")

In [3]:
# usuarios de leaderboard
def get_top_500(url='https://www.codewars.com/users/leaderboard'):
    '''
    Download the leaderboard page at `url` and return the set of link texts
    found inside its table rows (the usernames).
    '''
    page = BeautifulSoup(requests.get(url).content, "lxml")
    names = set()
    for anchor in page.select('tr a'):
        names.add(anchor.text)
    return names


In [4]:
# Salvamos los nombres en un documento

def save_set_users(users, filename='../output/codewars-users.txt'):
    '''
    Save a set/list of users to `filename`, one user per line.

    The file is overwritten. Users are joined with a newline and no trailing
    newline is written. The parent directory is created when it is missing,
    so a long scrape is not lost just because the output folder does not
    exist yet.
    '''
    import os  # local import keeps this cell self-contained

    parent = os.path.dirname(filename)
    if parent:  # a bare file name has no directory to create
        os.makedirs(parent, exist_ok=True)
    with open(filename, 'w') as f:
        f.write('\n'.join(users))
# save_set_users({'hola', 'adios'})

In [5]:
def save_string_users(users, filename='../output/codewars-users.txt'):
    ''' Overwrite `filename` with the string `users`, written verbatim. '''
    with open(filename, mode='w') as out_file:
        out_file.write(users)
# save_string_users(users)

In [6]:
def add_set_users(users, filename='../output/codewars-users.txt'):
    '''
    Append users to `filename`.

    `users` may be a string, which is written verbatim, or any other
    iterable of strings (set, list, ...), which is joined with newlines
    the same way save_set_users() does. No separator is added before or
    after the appended text; the caller controls line breaks.
    '''
    if not isinstance(users, str):
        users = '\n'.join(users)
    with open(filename, 'a') as f:
        f.write(users)
# add_set_users(users)

In [7]:
def load_set_users(filename='../output/codewars-users.txt'):
    '''
    Read `filename` and return the set of users it contains, one per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped so an empty file or a trailing newline does not add an empty
    username to the set.
    '''
    with open(filename, 'r') as f:
        stripped = (line.strip() for line in f)
        return {user for user in stripped if user}
# len(load_set_users())

### Get Social

In [8]:
# get the users connected to a given user: following, followers and allies

# https://www.codewars.com/users/albertogcmr/following
# https://www.codewars.com/users/albertogcmr/followers
# https://www.codewars.com/users/albertogcmr/allies

def get_social(user):
    '''
    Return the set of link texts found in the tables of the `following`,
    `followers` and `allies` pages of `user`.
    '''
    found = set()
    for section in ('following', 'followers', 'allies'):
        page_url = 'https://www.codewars.com/users/{}/{}'.format(user, section)
        page = BeautifulSoup(requests.get(page_url).content, "lxml")
        found.update(anchor.text for anchor in page.select('table a'))
    return found

# get_social('albertogcmr')

In [9]:
# cálculo de tiempos: 

def timeit(method):
    '''
    Decorator that measures how long each call to `method` takes.

    If the call receives a `log_time` keyword (a dict), the elapsed time in
    whole milliseconds is stored in it under `log_name` (default: the
    upper-cased function name). Otherwise the timing is printed.
    Both keywords are still forwarded to `method`, so a function called with
    them must accept them.
    '''
    import functools  # local import keeps this cell self-contained

    @functools.wraps(method)  # keep __name__/__doc__ of the wrapped function
    def timed(*args, **kw):
        # perf_counter is monotonic, so clock adjustments cannot skew the result
        ts = time.perf_counter()
        result = method(*args, **kw)
        elapsed_ms = (time.perf_counter() - ts) * 1000

        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('%r  %2.2f ms' %  (method.__name__, elapsed_ms) )
        return result

    return timed

@timeit
def print_prueba(a, b):
    ''' Small check of the timeit decorator: prints both arguments. '''
    print(a, b)
    
# No log_time keyword is passed, so the decorator prints the elapsed time.
print_prueba('hola', 'adios')

hola adios
'print_prueba'  0.26 ms


### Iteración para ampliar número de usuarios

A partir de la semilla inicial de 500 obtenemos casi 4000 usuarios de codewars

In [11]:
# Seed: leaderboard usernames. `total` is a separate set built from the seed.
top500 = get_top_500()
total = set().union(top500)
len(total)

499

In [None]:

@timeit
def get_social_from_users_set(users_set): 
    '''
    Expand `users_set` with the social connections of each of its users.

    Returns a new set containing the original users plus everything
    get_social() returns for them; the input is not modified. Users whose
    lookup fails are written to '../errors/social-error.txt'.
    '''
    res = set(users_set)
    errors = []
    for user in users_set: 
        try: 
            res.update(get_social(user))
        except Exception: 
            # Best effort: remember the user and keep going. Exception (not a
            # bare except) lets Ctrl-C interrupt a long scrape.
            errors.append(user)

    # Never lose the collected users because the error log cannot be written.
    try: 
        save_set_users(users=errors, filename='../errors/social-error.txt')
    except OSError: 
        print('Error en la escritura del archivo de error')
        print(errors)
    return res

# Expand the seed set with the social connections of every user in it

total = get_social_from_users_set(total)

In [12]:

# Number of users currently held in `total`
len(total)

499

In [13]:
# Add myself (and one more user), then pull in the social connections of
# every user connected to me

total = total | {'albertogcmr', 'boyander'}
for contact in get_social('albertogcmr'): 
    total = total | get_social(contact)
len(total)

534

### Persistencia de datos en un txt

In [14]:
# save_set_users(users=total, filename='../output/usuarioscodewars-3947.txt')
# The save call above is left commented out; the user set is reloaded from
# the file on disk instead.
total = load_set_users('../output/usuarioscodewars-3947.txt')

In [15]:
# Number of users loaded from the file
len(total)

3947

## Fase 2: Obtención de datos de usuario
### 2.1 API codewars

In [16]:
# GET user

def get_user_api(user='albertogcmr'):
    ''' Request the Codewars API entry for `user` and return the decoded JSON. '''
    endpoint = 'https://www.codewars.com/api/v1/users/{}'
    return requests.get(endpoint.format(user)).json()

# get_user_api('albertogcmr')

In [17]:
def get_value_from_json(user_json, value='username'): 
    ''' 
    Return the top-level entry `value` of the user JSON.
    possible values = ['username', 'honor', 'clan', 'leaderboardPosition', 'skills']
    Raises KeyError when a dict does not contain that key.
    '''
    return user_json[value]

def get_score_language(user_json, language): 
    '''
    Return user_json['ranks']['languages'][language]['score'], or 0 when any
    step of that lookup is missing or is not subscriptable.

    Only lookup errors are turned into 0; anything else (for example a
    KeyboardInterrupt) propagates instead of being silently swallowed.
    '''
    try: 
        return user_json['ranks']['languages'][language]['score']
    except (KeyError, TypeError, IndexError): 
        return 0


def get_scores(user_json, languages=None): 
    '''
    Return a dict {language: score} for `user_json`.

    `languages` is the iterable of language names to report. When omitted,
    the list is fetched with get_languages(), which performs a web request on
    every call; pass a precomputed collection when scoring many users.
    '''
    if languages is None: 
        languages = get_languages()
    return {lang: get_score_language(user_json, lang) for lang in languages}

# get_score_language(user_json=alberto, language='sca')
# get_value_from_json(alberto, 'skills')
# get_scores(alberto)

### 2.2 Web Scraping codewars

Get stats from user

In [18]:
def get_all_stats(user): 
    '''
    Scrape the profile page of `user` and return its stats as a dict.

    The dict holds 'username', one lower-cased key per "Label:value" stat
    entry, and 'github' / 'linkedin' with the profile link, or '' when the
    page has no such link, so every row ends up with the same link columns.
    '''
    url = 'https://www.codewars.com/users/{}'.format(user)
    html = requests.get(url).content
    soup = BeautifulSoup(html, "lxml")

    x = {'username': user, 'github': '', 'linkedin': ''}
    for s in soup.select('.stat-box div'): 
        # Split on the first colon only, so values that contain ':' are kept
        # whole and entries without a colon no longer raise IndexError.
        label, sep, value = s.text.partition(':')
        if label == 'Profiles': 
            for e in s.find_all('a', href=True): 
                if 'github' in e['href']: 
                    x['github'] = e['href']
                if 'linkedin' in e['href']: 
                    x['linkedin'] = e['href']
        elif sep: 
            x[label.lower()] = value
    return x

# get_all_stats('albertogcmr')

In [19]:
def get_languages(url='https://www.codewars.com/kata/latest/my-languages', ignore={'all', 'my languages'}): 
    '''
    Return the lower-cased option texts of the `#language_filter` selector
    found at `url`, minus the entries listed in `ignore`.
    '''
    page = BeautifulSoup(requests.get(url).content, "lxml")
    names = set()
    for option in page.select('#language_filter option'): 
        names.add(option.text.lower())
    return names.difference(ignore)

# Fetch the language names once and show how many there are
languages = get_languages()
len(languages)

38

In [20]:
def get_row(user): 
    ''' 
    Build a dict with everything collected for `user` (scraped profile stats
    plus one score per language) that can be added as a row of our pandas
    DataFrame.
    '''
    api_profile = get_user_api(user)
    row = get_all_stats(user)
    language_scores = get_scores(api_profile)
    row.update(language_scores)
    return row

def create_row(data, new_user): 
    '''
    Return a new DataFrame made of `data` plus one row for `new_user`.

    `data` is not modified and the index of the result is renumbered.
    Uses pd.concat because DataFrame.append was removed in pandas 2.0.
    '''
    new_row = pd.DataFrame([get_row(new_user)])
    if data.empty and len(data.columns) == 0: 
        # Nothing to concatenate with yet: the new row is the whole frame.
        return new_row
    return pd.concat([data, new_row], ignore_index=True, sort=False)

# ivan = get_row('ijcernicharo')
# alberto = get_row('albertogcmr')

### 2.3 Creación de DataFrame

In [25]:
# For each collected user we will create one record in df_users;
# turn the collected users into a list first
total_n = list(total)
len(total_n)

3947

In [24]:
@timeit
def create_df(users): 
    '''
    Build a DataFrame with one row per user in `users`.

    Progress (index and username) is printed for every user. Users whose row
    cannot be built are skipped and written to '../errors/df-row-error.txt';
    if that file cannot be written they are printed instead.
    '''
    rows = []
    errors = []
    for i, user in enumerate(users): 
        print(i, '\t', user)
        try: 
            rows.append(get_row(user))
        except Exception: 
            # Best effort: remember the user and keep going. Exception (not a
            # bare except) lets Ctrl-C interrupt a long scrape.
            errors.append(user)
    try: 
        save_set_users(users=errors, filename='../errors/df-row-error.txt')
    except Exception: 
        print('Error en la escritura del archivo de error')
        print(errors)

    # Build the frame once from all rows instead of growing it row by row
    # (quadratic, and DataFrame.append was removed in pandas 2.0).
    return pd.DataFrame(rows)

In [None]:
# Build the DataFrame for every collected user; create_df prints one
# progress line per user
df_users = create_df(total_n)
df_users.head()

0 	 mr.tk
1 	 Caffeinatedbrew
2 	 rscharfer
3 	 tommur


In [None]:
# Add two more users by hand, one row each
df_users = create_row(data=df_users, new_user='ijcernicharo')
df_users = create_row(data=df_users, new_user='pablobarrio')
df_users.head()

In [None]:
# (rows, columns) of the frame built so far
df_users.shape

In [None]:
# Round-trip through disk. index=False keeps the row index out of the file,
# so read_csv does not bring it back as an extra 'Unnamed: 0' column.
df_users.to_csv('../output/df-173columnas.txt', index=False)
df_users = pd.read_csv('../output/df-173columnas.txt')

In [None]:
# Check whether the frame has a 'linkedin' column
'linkedin' in df_users.columns

In [None]:
# Add one more user by hand and show the first seven rows
df_users = create_row(data=df_users, new_user='albertogcmr')
df_users.head(7)

In [None]:
# Check again, after adding that user, whether the frame has a 'linkedin' column
'linkedin' in df_users.columns

In [None]:
# df_users.to_csv('../output/dataframe.csv')
# `df` must be defined before it is used: load it from the saved CSV so this
# cell also runs on a fresh kernel.
# NOTE(review): assumes '../output/dataframe.csv' already exists on disk
# (written by the commented-out line above) — confirm.
df = pd.read_csv('../output/dataframe.csv')
df.head()

In [None]:
# Count how often each value appears in the `games` column
df.games.value_counts()


### 2.4 Limpieza de datos

Tenemos más de 100 columnas de las que sólo queremos las concernientes a sus datos de programación

In [None]:
# We are going to keep the following columns: one column per language
# returned by get_languages(), plus the profile columns listed below.
# (The original cell listed the names as bare words, which is not valid Python.)
columns_to_keep = sorted(get_languages()) + [
    'allies', 'clan', 'comments', 'followers', 'following', 'github', 'honor', 'last seen',
    # Original note: "eliminar el #" (remove the '#').
    # NOTE(review): presumably a '#' has to be stripped from this column — confirm.
    'leaderboard position',
    'member since', 'total completed kata', 'total languages trained', 'translations', 'username', 'avg. satisfaction rating',
    'contributed kata', 'created', 'data structures', 'data types', 'fundamentals', 'graphs', 'kata approvals',
    'total collected', 'total completions', 'total stars', 'skills', 'linkedin',
]
columns_to_keep


## Fase 3: Persistencia de datos

In [None]:
# TODO: save the data ("Guarda") — this persistence step is not implemented yet.