In [4]:
import pandas as pd

# URL del archivo CSV hospedado en GitHub (raw)
url = "https://raw.githubusercontent.com/github/innovationgraph/refs/heads/main/data/repositories.csv"

# Carga directa desde la URL
df = pd.read_csv(url)

# Muestra las primeras filas del DataFrame
df

Unnamed: 0,repositories,iso2_code,year,quarter
0,30669781,US,2020,1
1,20179087,CN,2020,1
2,17912888,EU,2020,1
3,9254535,IN,2020,1
4,4171705,GB,2020,1
...,...,...,...,...
4705,151,AQ,2025,1
4706,135,PM,2025,1
4707,117,MH,2025,1
4708,111,MS,2025,1


## Rename columns

In [5]:
df.rename(columns={"repositories": "num_repos"}, inplace=True)
df

Unnamed: 0,num_repos,iso2_code,year,quarter
0,30669781,US,2020,1
1,20179087,CN,2020,1
2,17912888,EU,2020,1
3,9254535,IN,2020,1
4,4171705,GB,2020,1
...,...,...,...,...
4705,151,AQ,2025,1
4706,135,PM,2025,1
4707,117,MH,2025,1
4708,111,MS,2025,1


## Select South America

In [13]:
df[ (df["iso2_code"] == "AR") | (df["iso2_code"] == "PE") ]

Unnamed: 0,num_repos,iso2_code,year,quarter
31,608445,AR,2020,1
49,356948,PE,2020,1
244,670189,AR,2020,2
263,391794,PE,2020,2
460,737110,AR,2020,3
480,424103,PE,2020,3
678,802976,AR,2020,4
698,456011,PE,2020,4
898,867545,AR,2021,1
916,485312,PE,2021,1


In [28]:
# List of South American ISO2 codes
south_america_codes = [
    "AR", "BO", "BR", "CL", "CO", "EC",
    "GY", "PY", "PE", "SR", "UY", "VE", "GF"  # GF optional (French Guiana)
]

# Filter DataFrame
df_south_america = df[df["iso2_code"].isin(south_america_codes)]
df_south_america

Unnamed: 0,num_repos,iso2_code,year,quarter
5,3602025,BR,2020,1
26,678828,CO,2020,1
31,608445,AR,2020,1
49,356948,PE,2020,1
50,353627,CL,2020,1
...,...,...,...,...
4566,257655,UY,2025,1
4583,126185,PY,2025,1
4651,7650,GY,2025,1
4656,5915,SR,2025,1


In [29]:
df_south_america.shape

(273, 4)

## Sort Dataset

In [30]:
df[ ["quarter", "year"] ]

Unnamed: 0,quarter,year
0,1,2020
1,1,2020
2,1,2020
3,1,2020
4,1,2020
...,...,...
4705,1,2025
4706,1,2025
4707,1,2025
4708,1,2025


In [31]:
df

Unnamed: 0,num_repos,iso2_code,year,quarter
0,30669781,US,2020,1
1,20179087,CN,2020,1
2,17912888,EU,2020,1
3,9254535,IN,2020,1
4,4171705,GB,2020,1
...,...,...,...,...
4705,151,AQ,2025,1
4706,135,PM,2025,1
4707,117,MH,2025,1
4708,111,MS,2025,1


In [32]:
df_south_america.columns

Index(['num_repos', 'iso2_code', 'year', 'quarter'], dtype='object')

In [33]:
# Reorder columns
df_south_america = df_south_america[ ["iso2_code", "year", "quarter", "num_repos"] ]
df_south_america

Unnamed: 0,iso2_code,year,quarter,num_repos
5,BR,2020,1,3602025
26,CO,2020,1,678828
31,AR,2020,1,608445
49,PE,2020,1,356948
50,CL,2020,1,353627
...,...,...,...,...
4566,UY,2025,1,257655
4583,PY,2025,1,126185
4651,GY,2025,1,7650
4656,SR,2025,1,5915


Unnamed: 0,index,iso2_code,year,quarter,num_repos
0,5,BR,2020,1,3602025
1,26,CO,2020,1,678828
2,31,AR,2020,1,608445
3,49,PE,2020,1,356948
4,50,CL,2020,1,353627
...,...,...,...,...,...
268,4566,UY,2025,1,257655
269,4583,PY,2025,1,126185
270,4651,GY,2025,1,7650
271,4656,SR,2025,1,5915


Unnamed: 0,iso2_code,year,quarter,num_repos
0,BR,2020,1,3602025
1,CO,2020,1,678828
2,AR,2020,1,608445
3,PE,2020,1,356948
4,CL,2020,1,353627
...,...,...,...,...
268,UY,2025,1,257655
269,PY,2025,1,126185
270,GY,2025,1,7650
271,SR,2025,1,5915


In [37]:
df_south_america = df_south_america.sort_values(
    by=["iso2_code", "year", "quarter"],
    ascending=[True, True, True]
)
df_south_america

Unnamed: 0,iso2_code,year,quarter,num_repos
2,AR,2020,1,608445
15,AR,2020,2,670189
28,AR,2020,3,737110
41,AR,2020,4,802976
54,AR,2021,1,867545
...,...,...,...,...
214,VE,2024,1,321927
227,VE,2024,2,339253
240,VE,2024,3,353254
253,VE,2024,4,362967


In [43]:
df_south_america.reset_index(inplace=True)
df_south_america

Unnamed: 0,index,iso2_code,year,quarter,num_repos
0,2,AR,2020,1,608445
1,15,AR,2020,2,670189
2,28,AR,2020,3,737110
3,41,AR,2020,4,802976
4,54,AR,2021,1,867545
...,...,...,...,...,...
268,214,VE,2024,1,321927
269,227,VE,2024,2,339253
270,240,VE,2024,3,353254
271,253,VE,2024,4,362967


In [44]:
df_south_america.drop("index", axis=1, inplace=True)
df_south_america

Unnamed: 0,iso2_code,year,quarter,num_repos
0,AR,2020,1,608445
1,AR,2020,2,670189
2,AR,2020,3,737110
3,AR,2020,4,802976
4,AR,2021,1,867545
...,...,...,...,...
268,VE,2024,1,321927
269,VE,2024,2,339253
270,VE,2024,3,353254
271,VE,2024,4,362967


In [41]:
df_south_america.shape

(273, 4)

In [42]:
df_south_america["iso2_code"].nunique()

13

In [38]:
df_south_america.iso2_code.nunique()

13

In [39]:
df_south_america.year.nunique()

6

In [40]:
df_south_america.quarter.nunique()

4

In [61]:
df_south_america["year"].astype(int)

0      2020
1      2020
2      2020
3      2020
4      2021
       ... 
268    2024
269    2024
270    2024
271    2024
272    2025
Name: year, Length: 273, dtype: int32

In [69]:
lm_df =  df_south_america[["iso2_code", "year", "quarter", "num_repos"]].copy()
lm_df

Unnamed: 0,iso2_code,year,quarter,num_repos
0,AR,2020,1,608445
1,AR,2020,2,670189
2,AR,2020,3,737110
3,AR,2020,4,802976
4,AR,2021,1,867545
...,...,...,...,...
268,VE,2024,1,321927
269,VE,2024,2,339253
270,VE,2024,3,353254
271,VE,2024,4,362967


In [45]:
import pandas as pd

# Ensure correct order and types
df_south_america = df_south_america[["iso2_code", "year", "quarter", "num_repos"]].copy()
df_south_america["year"] = df_south_america["year"].astype(int)
df_south_america["quarter"] = df_south_america["quarter"].astype(int)

# Define the full cartesian product (all combos)
all_iso2   = sorted(df_south_america["iso2_code"].unique())
all_years  = sorted(df_south_america["year"].unique())
all_quarts = [1, 2, 3, 4]   # or: sorted(df_south_america["quarter"].unique())

In [63]:
all_years

[2020, 2021, 2022, 2023, 2024, 2025]

In [64]:
full_index = pd.MultiIndex.from_product(
    [all_iso2, all_years, all_quarts],
    names=["iso2_code", "year", "quarter"]
)
full_index

MultiIndex([('AR', 2020, 1),
            ('AR', 2020, 2),
            ('AR', 2020, 3),
            ('AR', 2020, 4),
            ('AR', 2021, 1),
            ('AR', 2021, 2),
            ('AR', 2021, 3),
            ('AR', 2021, 4),
            ('AR', 2022, 1),
            ('AR', 2022, 2),
            ...
            ('VE', 2023, 3),
            ('VE', 2023, 4),
            ('VE', 2024, 1),
            ('VE', 2024, 2),
            ('VE', 2024, 3),
            ('VE', 2024, 4),
            ('VE', 2025, 1),
            ('VE', 2025, 2),
            ('VE', 2025, 3),
            ('VE', 2025, 4)],
           names=['iso2_code', 'year', 'quarter'], length=312)

In [70]:
df_south_america.set_index(["iso2_code", "year", "quarter"]).reindex(full_index).reset_index()

Unnamed: 0,iso2_code,year,quarter,num_repos
0,AR,2020,1,608445.0
1,AR,2020,2,670189.0
2,AR,2020,3,737110.0
3,AR,2020,4,802976.0
4,AR,2021,1,867545.0
...,...,...,...,...
307,VE,2024,4,362967.0
308,VE,2025,1,380857.0
309,VE,2025,2,
310,VE,2025,3,


In [None]:
df_south_america.set_index(["iso2_code", "year", "quarter"]).reindex(full_index)

In [50]:
# Reindex to the full panel
balanced = (
    df_south_america
    .set_index(["iso2_code", "year", "quarter"])
    .reindex(full_index)
    .reset_index()
)
balanced

Unnamed: 0,iso2_code,year,quarter,num_repos
0,AR,2020,1,608445.0
1,AR,2020,2,670189.0
2,AR,2020,3,737110.0
3,AR,2020,4,802976.0
4,AR,2021,1,867545.0
...,...,...,...,...
307,VE,2024,4,362967.0
308,VE,2025,1,380857.0
309,VE,2025,2,
310,VE,2025,3,


In [55]:
balanced[balanced.num_repos.isna()]

Unnamed: 0,iso2_code,year,quarter,num_repos
21,AR,2025,2,
22,AR,2025,3,
23,AR,2025,4,
45,BO,2025,2,
46,BO,2025,3,
47,BO,2025,4,
69,BR,2025,2,
70,BR,2025,3,
71,BR,2025,4,
93,CL,2025,2,


In [56]:
# Optional: fill missing outcomes (choose ONE)
# If "missing means zero"
balanced["num_repos"] = balanced["num_repos"].fillna(0).astype(int)
balanced

Unnamed: 0,iso2_code,year,quarter,num_repos
0,AR,2020,1,608445
1,AR,2020,2,670189
2,AR,2020,3,737110
3,AR,2020,4,802976
4,AR,2021,1,867545
...,...,...,...,...
307,VE,2024,4,362967
308,VE,2025,1,380857
309,VE,2025,2,0
310,VE,2025,3,0


In [75]:
balanced.to_csv("../_data/github_repos_data_2020_2015_sa.csv")

In [76]:
type(balanced)

pandas.core.frame.DataFrame

In [77]:
pip install python-docx

Note: you may need to restart the kernel to use updated packages.




In [78]:
from docx import Document

# Create a new Word document
doc = Document()
doc.add_heading('Balanced GitHub Repos Data (South America) 2020 -2025', level=1)

# Add a table with the DataFrame content
table = doc.add_table(rows=1, cols=len(balanced.columns))

# Add header row
hdr_cells = table.rows[0].cells
for i, col_name in enumerate(balanced.columns):
    hdr_cells[i].text = str(col_name)

# Add data rows
for row in balanced.itertuples(index=False):
    row_cells = table.add_row().cells
    for i, value in enumerate(row):
        row_cells[i].text = str(value)

# Save the document
doc.save("../_data/github_repos_data_2020_2015_sa.docx")