# **Importing dependencies**

In [38]:
import os

import requests
from pyspark.sql import functions as F, SparkSession, types as T

# **Creating spark session**

In [39]:
# Obtain the notebook's Spark session via getOrCreate(), configured with the
# 'local[*]' master and the 'candidates_finder' application name.
spark = SparkSession.builder.master('local[*]').appName('candidates_finder').getOrCreate()

# **Defining configs**

In [40]:
# --- GitHub API -------------------------------------------------------------
# Endpoint listing the followers of the target account.
FOLLOWERS_URL = "https://api.github.com/users/calilisantos/followers"
# Key of each follower entry that holds the URL of that follower's profile.
FOLLOWER_KEY = "url"
# Profile fields copied from every follower payload, in output column order.
COLUMNS_LIST = [
    "name", "company", "blog", "email", "bio",
    "public_repos", "followers", "following",
    "created_at",
]

# --- Transformations --------------------------------------------------------
# Regex pattern removed from the "company" column.
COMPANY_PATTERN_TO_REMOVE = "@"
# Column reformatted as a date string, and the target pattern for it.
DATE_COLUMN = "created_at"
DATE_FORMAT = "dd/MM/yyyy"

# --- Output -----------------------------------------------------------------
# Destination of the exported CSV, relative to the working directory.
CSV_FILE = "followers_info.csv"
# Read the GitHub personal access token from the GITHUB_TOKEN environment
# variable so that no credential is ever stored in (or committed with) the
# notebook. An unset/blank variable yields an empty token.
token = os.environ.get("GITHUB_TOKEN", "").strip()
headers = {
    "Accept": "application/vnd.github.v3+json",
}
# Only send an Authorization header when a token is available; without it the
# requests go out unauthenticated instead of carrying an invalid bearer value.
if token:
    headers["Authorization"] = f"Bearer {token}"
# Explicit schema for the follower records: one nullable field per profile
# column, with the three counters typed as integers and the rest as strings.
schema = (
    T.StructType()
    .add("name", T.StringType(), True)
    .add("company", T.StringType(), True)
    .add("blog", T.StringType(), True)
    .add("email", T.StringType(), True)
    .add("bio", T.StringType(), True)
    .add("public_repos", T.IntegerType(), True)
    .add("followers", T.IntegerType(), True)
    .add("following", T.IntegerType(), True)
    .add("created_at", T.StringType(), True)
)



# **Creating get_followers_info function**

In [41]:
def get_followers_info(followers_url, timeout=10):
    """Fetch one follower's profile and keep only the columns in COLUMNS_LIST.

    Args:
        followers_url: URL of the follower's profile resource.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A dict with exactly the keys of COLUMNS_LIST. A field missing from the
        payload is None. If the request fails, the body is not valid JSON, or
        the payload is not a JSON object, every value is None (best-effort:
        the error is printed, never raised).
    """
    empty_record = dict.fromkeys(COLUMNS_LIST)
    try:
        response = requests.get(followers_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        # Decode the body once instead of re-parsing it for every column.
        profile = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP status errors;
        # ValueError covers a body that cannot be decoded as JSON.
        print(f"Error: {e}")
        return empty_record
    if not isinstance(profile, dict):
        print(f"Error: unexpected payload type {type(profile).__name__}")
        return empty_record
    # .get() keeps the fields that are present even when one key is absent,
    # rather than discarding the whole record.
    return {column: profile.get(column) for column in COLUMNS_LIST}

# **Getting followers info**

In [42]:
# Walk every page of the followers listing. The GitHub API paginates list
# endpoints, so a single request would only return the first page.
followers_url_response_json = []
next_page_url = FOLLOWERS_URL
while next_page_url:
    # The timeout keeps a stalled connection from hanging the notebook.
    followers_url_response = requests.get(next_page_url, headers=headers, timeout=10)
    # Fail loudly on auth / rate-limit errors: their body is a JSON object,
    # which must not be iterated as if it were the list of followers.
    followers_url_response.raise_for_status()
    followers_url_response_json.extend(followers_url_response.json())
    # requests exposes the parsed Link header as `.links`; the absence of a
    # "next" relation means the last page has been reached.
    next_page_url = followers_url_response.links.get("next", {}).get("url")

# Profile URL of each follower.
followers_lists = [follower[FOLLOWER_KEY] for follower in followers_url_response_json]

# One profile record per follower (all-None record when a lookup fails).
followers_info = [get_followers_info(follower_url) for follower_url in followers_lists]
followers_info


[{'name': 'Christian Deacon',
  'company': '@deaconn-net',
  'blog': 'https://deaconn.net/',
  'email': 'christianmdeacon@gmail.com',
  'bio': 'Software/network engineer and developer with a love for open source!\r\n\r\n@modcommunity @bestmods @bestserversio @gamecomio ',
  'public_repos': 144,
  'followers': 23062,
  'following': 81347,
  'created_at': '2014-01-26T22:43:05Z'},
 {'name': 'Levy Nunes',
  'company': None,
  'blog': 'https://www.linkedin.com/in/levyvix/',
  'email': 'levy.vix@gmail.com',
  'bio': 'Estatística - UFES',
  'public_repos': 105,
  'followers': 28,
  'following': 65,
  'created_at': '2014-10-30T22:52:20Z'},
 {'name': 'Leandro Teixeira',
  'company': None,
  'blog': '',
  'email': 'leandroteixeira3@gmail.com',
  'bio': 'Software Developer passionate about Machine Learning and Python. \r\n',
  'public_repos': 25,
  'followers': 32,
  'following': 50,
  'created_at': '2014-11-10T11:48:31Z'},
 {'name': "Fernando D'luccas",
  'company': None,
  'blog': '',
  'email'

# **Creating dataframe**

In [43]:

# Turn the collected follower records into a Spark DataFrame using the
# explicit schema, then preview it.
data = spark.createDataFrame(data=followers_info, schema=schema)
data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+------------+---------+---------+--------------------+
|                name|             company|                blog|               email|                 bio|public_repos|followers|following|          created_at|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------+---------+---------+--------------------+
|    Christian Deacon|        @deaconn-net|https://deaconn.net/|christianmdeacon@...|Software/network ...|         144|    23062|    81347|2014-01-26T22:43:05Z|
|          Levy Nunes|                NULL|https://www.linke...|  levy.vix@gmail.com|  Estatística - UFES|         105|       28|       65|2014-10-30T22:52:20Z|
|    Leandro Teixeira|                NULL|                    |leandroteixeira3@...|Software Develope...|          25|       32|       50|2014-11-10T11:48:31Z|
|   Fernando D'luccas|            

# **Transforming company column**

In [44]:
# Overwrite "company" with a copy in which every match of
# COMPANY_PATTERN_TO_REMOVE is replaced by the empty string.
data = data.withColumn("company", F.regexp_replace("company", COMPANY_PATTERN_TO_REMOVE, ""))

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+------------+---------+---------+--------------------+
|                name|             company|                blog|               email|                 bio|public_repos|followers|following|          created_at|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------+---------+---------+--------------------+
|    Christian Deacon|         deaconn-net|https://deaconn.net/|christianmdeacon@...|Software/network ...|         144|    23062|    81347|2014-01-26T22:43:05Z|
|          Levy Nunes|                NULL|https://www.linke...|  levy.vix@gmail.com|  Estatística - UFES|         105|       28|       65|2014-10-30T22:52:20Z|
|    Leandro Teixeira|                NULL|                    |leandroteixeira3@...|Software Develope...|          25|       32|       50|2014-11-10T11:48:31Z|
|   Fernando D'luccas|            

# **Transforming created_at column**

In [45]:
# Overwrite DATE_COLUMN with its value rendered in DATE_FORMAT.
# NOTE(review): the column is a string here, so this relies on Spark's implicit
# string-to-timestamp conversion (presumably session-time-zone dependent), and
# re-running the cell on already-formatted values may not parse - confirm.
data = data.withColumn(DATE_COLUMN, F.date_format(DATE_COLUMN, DATE_FORMAT))

data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+------------+---------+---------+----------+
|                name|             company|                blog|               email|                 bio|public_repos|followers|following|created_at|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------+---------+---------+----------+
|    Christian Deacon|         deaconn-net|https://deaconn.net/|christianmdeacon@...|Software/network ...|         144|    23062|    81347|26/01/2014|
|          Levy Nunes|                NULL|https://www.linke...|  levy.vix@gmail.com|  Estatística - UFES|         105|       28|       65|30/10/2014|
|    Leandro Teixeira|                NULL|                    |leandroteixeira3@...|Software Develope...|          25|       32|       50|10/11/2014|
|   Fernando D'luccas|                NULL|                    |fernandodluccas@g...|         

# **Saving data to csv**

In [46]:
# Convert the Spark DataFrame to pandas and write it to CSV_FILE without the
# pandas index column.
data.toPandas().to_csv(path_or_buf=CSV_FILE, index=False)