## Using Ground Truth

In [1]:
import pandas as pd

# Load the table that carries both the Ground_Truth and RF_Predictions columns.
# The file is separated by the BEL control character ("\a"), not by commas.
df = pd.read_csv(
    "output/ground_truth_with_RF_prediction.csv",
    sep="\a",
)

In [2]:
# Narrow the frame to the author identity columns plus the ground-truth labels.
gt_columns = ["PR_Author_Name", "PR_Author_Username", "Ground_Truth"]
df = df[gt_columns]
df.head()

Unnamed: 0,PR_Author_Name,PR_Author_Username,Ground_Truth
0,Ingvar Jackal,IngvarJackal,"['Databases-Database Security', 'Data Structur..."
1,Jörg Lenhard,lenhard,"['Databases-Database Security', 'Error Handlin..."
2,Oscar Gustafsson,oscargus,"['Databases-Database Security', 'Error Handlin..."
3,Oscar Gustafsson,oscargus,"['Databases-Database Security', 'Error Handlin..."
4,Jörg Lenhard,lenhard,"['Databases-Database Security', 'Error Handlin..."


In [3]:
import ast
# Each Ground_Truth cell holds the text of a Python list literal; parse it into a
# real list so the labels can be exploded into separate rows afterwards.
parsed_ground_truth = df["Ground_Truth"].apply(ast.literal_eval)
df["Ground_Truth"] = parsed_ground_truth

In [4]:
# Give every label in a row's list its own row; the original row index is repeated
# for each label that came from the same row.
df = df.explode(column="Ground_Truth")
df.head()

Unnamed: 0,PR_Author_Name,PR_Author_Username,Ground_Truth
0,Ingvar Jackal,IngvarJackal,Databases-Database Security
0,Ingvar Jackal,IngvarJackal,Data Structure-Data Sorting
0,Ingvar Jackal,IngvarJackal,Error Handling-Exception Handling
1,Jörg Lenhard,lenhard,Databases-Database Security
1,Jörg Lenhard,lenhard,Error Handling-Exception Handling


In [5]:
# Summarise the exploded frame: total rows and non-null count per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1719 entries, 0 to 572
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   PR_Author_Name      1599 non-null   object
 1   PR_Author_Username  1719 non-null   object
 2   Ground_Truth        1719 non-null   object
dtypes: object(3)
memory usage: 53.7+ KB


In [6]:
# Collect every ground-truth label belonging to a username into a single list
# (one output row per username; repeated labels are kept).
labels_by_author = df.groupby(["PR_Author_Username"])["Ground_Truth"]
skills_using_GT = labels_by_author.agg(list).reset_index()
skills_using_GT.head()

Unnamed: 0,PR_Author_Username,Ground_Truth
0,AlexanderGirgis,"[Databases-Database Security, Error Handling-E..."
1,Ali96kz,"[Databases-Database Security, Error Handling-E..."
2,Andrew-Lev,"[Databases-Database Security, Error Handling-E..."
3,BJaroszkowski,"[Databases-Database Security, Data Structure-D..."
4,Brainsucker92,"[Databases-Database Security, Data Structure-D..."


In [7]:
# Persist the per-username ground-truth label lists as a comma-separated file
# with a header row and without the positional index.
skills_using_GT.to_csv(
    "output/developer_skills_using_GT.csv",
    sep=",",
    header=True,
    index=False,
)

## Using RF Predictions

In [8]:
# Reload the same BEL-separated ("\a") file from scratch so the RF_Predictions
# column, dropped from `df` in the ground-truth section, is available again.
df = pd.read_csv(
    "output/ground_truth_with_RF_prediction.csv",
    sep="\a",
)

In [9]:
# Keep the author identity columns together with the RF_Predictions column.
df = df.loc[:, ["PR_Author_Name", "PR_Author_Username", "RF_Predictions"]]
df.head()

Unnamed: 0,PR_Author_Name,PR_Author_Username,RF_Predictions
0,Ingvar Jackal,IngvarJackal,"['Databases-Backup and Recovery', 'Application..."
1,Jörg Lenhard,lenhard,"['Databases-Backup and Recovery', 'Application..."
2,Oscar Gustafsson,oscargus,['Software Development and IT Operations-Confi...
3,Oscar Gustafsson,oscargus,"['Databases-Backup and Recovery', 'Application..."
4,Jörg Lenhard,lenhard,['Software Development and IT Operations-Confi...


In [10]:
import ast
# RF_Predictions cells hold the text of a Python list literal; convert each one
# into an actual list before exploding.
parsed_predictions = df["RF_Predictions"].apply(ast.literal_eval)
df["RF_Predictions"] = parsed_predictions

In [11]:
# One row per predicted label; rows that came from the same original row share
# its index value.
exploded_predictions = df.explode("RF_Predictions")
df = exploded_predictions
df.head()

Unnamed: 0,PR_Author_Name,PR_Author_Username,RF_Predictions
0,Ingvar Jackal,IngvarJackal,Databases-Backup and Recovery
0,Ingvar Jackal,IngvarJackal,Application-Version Control
0,Ingvar Jackal,IngvarJackal,Software Development and IT Operations-Configu...
1,Jörg Lenhard,lenhard,Databases-Backup and Recovery
1,Jörg Lenhard,lenhard,Application-Version Control


In [12]:
# Summarise the exploded frame: total rows and non-null count per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1719 entries, 0 to 572
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   PR_Author_Name      1599 non-null   object
 1   PR_Author_Username  1719 non-null   object
 2   RF_Predictions      1719 non-null   object
dtypes: object(3)
memory usage: 53.7+ KB


In [13]:
# Gather all predicted labels of each username into one list per username.
skills_using_RF = (
    df.groupby(["PR_Author_Username"])["RF_Predictions"]
    .agg(list)
    .reset_index()
)
skills_using_RF.head()

Unnamed: 0,PR_Author_Username,RF_Predictions
0,AlexanderGirgis,"[Databases-Backup and Recovery, Application-Ve..."
1,Ali96kz,[Software Development and IT Operations-Config...
2,Andrew-Lev,[Software Development and IT Operations-Config...
3,BJaroszkowski,[Software Development and IT Operations-Config...
4,Brainsucker92,"[Databases-Backup and Recovery, Application-Ve..."


In [14]:
# Persist the per-username RF prediction lists as a comma-separated file with a
# header row and without the positional index.
skills_using_RF.to_csv(
    "output/developer_skills_using_RF.csv",
    sep=",",
    header=True,
    index=False,
)

## Using Aggregated Projects Data

In [18]:
# Load the all-repositories table; like the other inputs it is separated by the
# BEL control character ("\a").
all_repositories_csv = "output/GT_and_RF_predictions_all_repositories.csv"
df = pd.read_csv(all_repositories_csv, sep="\a")

In [19]:
# Narrow the all-repositories frame to author identity plus ground-truth labels.
kept_columns = ["PR_Author_Name", "PR_Author_Username", "Ground_Truth"]
df = df[kept_columns]
df.head()

Unnamed: 0,PR_Author_Name,PR_Author_Username,Ground_Truth
0,Ingvar Jackal,IngvarJackal,"['Databases-Database Security', 'Data Structur..."
1,Jörg Lenhard,lenhard,"['Databases-Database Security', 'Error Handlin..."
2,Oscar Gustafsson,oscargus,"['Databases-Database Security', 'Error Handlin..."
3,Oscar Gustafsson,oscargus,"['Databases-Database Security', 'Error Handlin..."
4,Jörg Lenhard,lenhard,"['Databases-Database Security', 'Error Handlin..."


In [20]:
import ast
# Turn the textual list literals in Ground_Truth into real Python lists.
df = df.assign(Ground_Truth=df["Ground_Truth"].apply(ast.literal_eval))

In [None]:
# The frame was already narrowed to PR_Author_Name, PR_Author_Username and
# Ground_Truth in an earlier cell, so selecting those columns again would be a
# no-op. Only preview the frame with its parsed label lists.
df.head()

In [21]:
# Expand each list of ground-truth labels into one row per label; the source
# row's index value is repeated for every label it produced.
df = df.explode(column="Ground_Truth")
df.head()

Unnamed: 0,PR_Author_Name,PR_Author_Username,Ground_Truth
0,Ingvar Jackal,IngvarJackal,Databases-Database Security
0,Ingvar Jackal,IngvarJackal,Data Structure-Data Sorting
0,Ingvar Jackal,IngvarJackal,Error Handling-Exception Handling
1,Jörg Lenhard,lenhard,Databases-Database Security
1,Jörg Lenhard,lenhard,Error Handling-Exception Handling


In [22]:
# Summarise the exploded all-repositories frame: total rows and non-null count per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1497 entries, 0 to 498
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   PR_Author_Name      1422 non-null   object
 1   PR_Author_Username  1497 non-null   object
 2   Ground_Truth        1497 non-null   object
dtypes: object(3)
memory usage: 46.8+ KB


In [24]:
# Collect every ground-truth label of a username (all-repositories data) into a
# single list per username. The result is built from the Ground_Truth column, so
# it gets a GT name.
skills_using_GT_all_repositories = (
    df.groupby(["PR_Author_Username"])["Ground_Truth"]
    .agg(list)
    .reset_index()
)
# Backward-compatible alias: this cell used to store the result under the
# misleading name `skills_using_RF`, which the export cell that follows still reads.
skills_using_RF = skills_using_GT_all_repositories
skills_using_GT_all_repositories.head()

Unnamed: 0,PR_Author_Username,Ground_Truth
0,AlexanderGirgis,"[Databases-Database Security, Error Handling-E..."
1,Andrew-Lev,"[Databases-Database Security, Error Handling-E..."
2,Brainsucker92,"[Databases-Database Security, Data Structure-D..."
3,CaptainDaVinci,"[Databases-Schema Design, Geographic Informati..."
4,CyraxSector,"[Databases-Database Security, Data Structure-D..."


In [25]:
# Despite its name, `skills_using_RF` was aggregated from the Ground_Truth column
# in the preceding cell, so export it under a GT file name (matching
# "developer_skills_using_GT.csv") rather than under the RF file name.
skills_using_RF.to_csv(
    "output/developer_skills_using_GT_all_repositories.csv",
    index=False,
    header=True,
    sep=",",
)

### Using RF Predictions

In [28]:
# Reload the BEL-separated ("\a") all-repositories file so the RF_Predictions
# column, dropped from `df` earlier, is available again.
df = pd.read_csv(
    "output/GT_and_RF_predictions_all_repositories.csv",
    sep="\a",
)

In [29]:
# Keep the author identity columns together with the RF_Predictions column.
rf_columns = ["PR_Author_Name", "PR_Author_Username", "RF_Predictions"]
df = df[rf_columns]
df.head()

Unnamed: 0,PR_Author_Name,PR_Author_Username,RF_Predictions
0,Ingvar Jackal,IngvarJackal,"['Databases-Backup and Recovery', 'Application..."
1,Jörg Lenhard,lenhard,"['Databases-Backup and Recovery', 'Application..."
2,Oscar Gustafsson,oscargus,['Software Development and IT Operations-Confi...
3,Oscar Gustafsson,oscargus,"['Databases-Backup and Recovery', 'Application..."
4,Jörg Lenhard,lenhard,['Software Development and IT Operations-Confi...
