In [1]:
#import required packages
import numpy as np #this is the linear algebra package for working with matrices
import pandas as pd #pandas is a dataframe package that is useful for managing network attributes
import math #use this package to take the log of a scalar, use numpy to take the element-wise log of an array
import copy
import os

In [2]:
# The merged file is read without a header row, so the column labels are assigned by hand.
raw_columns = ["index","Language", "Influenced By", "Influenced","Year","duplicate"]
kept_columns = ["Language","Influenced By","Influenced","Year"]

df = pd.read_csv('final_merged.csv', header=None, sep=",")
df.columns = raw_columns
# keep only the columns used below ("index" and "duplicate" are dropped)
df = df.loc[:, kept_columns]
# strip surrounding whitespace from the language names
df['Language'] = df['Language'].str.strip()
df

Unnamed: 0,Language,Influenced By,Influenced,Year
0,Assembly language,,,1947.0
1,Plankalkül,[Begriffsschrift],"[Superplan, ALGOL 58]",1948.0
2,Short Code,[ENIAC Short Code],"[Intermediate programming language, OMNIBAC Sy...",1950.0
3,G-code,,,1950.0
4,Superplan,[Plankalkül],[ALGOL 58],1951.0
...,...,...,...,...
691,Z notation,,,
692,Zebra Programming Language,[ANSI BASIC],,
693,Zeno,,,
694,ZOPL,,,


In [3]:
# number of distinct language names (compare with the row count to spot repeated names)
unique_languages = df['Language'].unique()
n = len(unique_languages)
print(n)

672


In [4]:
# remove rows that repeat an earlier row in all four data columns
dedupe_columns = ['Language','Influenced By','Influenced',"Year"]
df = df.drop_duplicates(subset=dedupe_columns)
df

Unnamed: 0,Language,Influenced By,Influenced,Year
0,Assembly language,,,1947.0
1,Plankalkül,[Begriffsschrift],"[Superplan, ALGOL 58]",1948.0
2,Short Code,[ENIAC Short Code],"[Intermediate programming language, OMNIBAC Sy...",1950.0
3,G-code,,,1950.0
4,Superplan,[Plankalkül],[ALGOL 58],1951.0
...,...,...,...,...
691,Z notation,,,
692,Zebra Programming Language,[ANSI BASIC],,
693,Zeno,,,
694,ZOPL,,,


In [5]:
def invalid_infl(node, known_languages=None):
    """Return True when `node` is not one of the known language names.

    Parameters
    ----------
    node : str
        Candidate language name taken from an "Influenced" list.
    known_languages : container of str, optional
        Names to check against. When omitted, the "Language" column of the
        notebook-level `df` is used, so one-argument calls behave as before.

    Returns
    -------
    bool
        True if `node` is absent from the known names, False otherwise.
    """
    if known_languages is None:
        # Looked up at call time so the check always reflects the current `df`.
        known_languages = df['Language'].values
    return node not in known_languages


def invalid_inflby(node, known_languages=None):
    """Return True when `node` (from an "Influenced By" list) is not a known language.

    The rule is the same as for `invalid_infl`; this name is kept so existing
    callers continue to work.
    """
    return invalid_infl(node, known_languages)

In [6]:
# Drop every name listed in "Influenced By" or "Influenced" that is not itself a
# row (node) of `df`, so that each remaining reference points at an existing language.
def _keep_known_languages(cell, is_invalid):
    """Filter a bracketed, comma-separated list string such as "[A, B]".

    Parameters
    ----------
    cell : str
        Cell text; the first and last characters are treated as the brackets.
    is_invalid : callable
        Predicate returning True for a name that must be removed.

    Returns
    -------
    str or float
        The rebuilt "[...]" string, or NaN when nothing survives the filter.
    """
    names = cell[1:-1].split(", ")
    joined = ", ".join(name for name in names if not is_invalid(name))
    # An empty result is stored as NaN, the same marker used for "no list at all".
    return "[" + joined + "]" if joined else float("nan")


# Each column is paired with its own validity check.
_columns_to_prune = (
    ("Influenced By", invalid_inflby),
    ("Influenced", invalid_infl),
)

for index, row in df.iterrows():
    for column, is_invalid in _columns_to_prune:
        # Access by column label: integer keys on a label-indexed Series are
        # deprecated in recent pandas releases.
        cell = row[column]
        if not pd.isna(cell):
            df.at[index, column] = _keep_known_languages(cell, is_invalid)


In [10]:
def _mirror_links(frame, source_col, target_col):
    """Make `target_col` reflect the links listed in `source_col`.

    For every row A whose `source_col` lists a language B, A is appended to B's
    `target_col` list when it is not already there. `frame` is modified in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain "Language", `source_col` and `target_col`; list cells are
        bracketed, comma-separated strings such as "[A, B]".
    source_col, target_col : str
        Column to read links from, and column to update on the linked row.

    Raises
    ------
    IndexError
        If a listed name has no row in `frame`.
    """
    for _, row in frame.iterrows():
        # Access by column label: integer keys on a label-indexed Series are
        # deprecated in recent pandas releases.
        cell = row[source_col]
        if pd.isna(cell):
            continue
        curr_lang = row["Language"]
        for name in cell[1:-1].split(", "):
            # first row whose Language equals the listed name
            idx = frame.index[frame["Language"] == name].tolist()[0]
            target = frame.at[idx, target_col]
            if not isinstance(target, str):
                # NOTE(review): when the linked row has no list yet (NaN) the
                # reciprocal entry is NOT created, so the two columns can stay
                # asymmetric. Kept as-is because later cells branch on these
                # NaNs; confirm whether "[curr_lang]" should be written here.
                continue
            linked = target[1:-1].split(", ")
            if curr_lang not in linked:
                linked.append(curr_lang)
                frame.at[idx, target_col] = "[" + ", ".join(linked) + "]"


# if language A is influenced by language B, B must list A in "Influenced"
_mirror_links(df, "Influenced By", "Influenced")
# likewise, if language A influenced language B, B must list A in "Influenced By"
_mirror_links(df, "Influenced", "Influenced By")


    #gets row index of certain col value
    #df.index[df['Language']=="Scala"].tolist()[0]

In [11]:
# rows whose Year is missing, restricted to the name and the two link columns
year_is_missing = df['Year'].isna()
df_year_null = df.loc[year_is_missing, ["Language","Influenced By","Influenced"]]
df_year_null

Unnamed: 0,Language,Influenced By,Influenced
422,A-0 System,,
423,A++,,
424,ABC ALGOL,,
425,ACC,,
426,Distributed Application Specification Language,,
...,...,...,...
691,Z notation,,
692,Zebra Programming Language,,
693,Zeno,,
694,ZOPL,,


In [12]:
# year-less languages that still list at least one "Influenced By" entry
has_influenced_by = df_year_null['Influenced By'].notna()
df1 = df_year_null.loc[has_influenced_by]
df1

Unnamed: 0,Language,Influenced By,Influenced
441,BETA,[Simula],
443,C/AL,"[Pascal, Object Pascal]",
444,Carbon,"[C++, Rust]",
449,Cg,[C],
477,F*,"[Coq, Lean, OCaml, Standard ML, ML, F Sharp]",
481,Flix,"[Go, Haskell, OCaml, Scala]",
494,Gosu,[Java],[Kotlin]
510,ISLISP,"[Common Lisp, Scheme]",
520,JScript .NET,"[JScript, ECMAScript]",
525,KUKA Robot Language,[Pascal],


In [13]:
# year-less languages that still list at least one "Influenced" entry
has_influenced = df_year_null['Influenced'].notna()
df2 = df_year_null.loc[has_influenced]
df2

Unnamed: 0,Language,Influenced By,Influenced
494,Gosu,[Java],[Kotlin]
592,Pict,[ML],[Orc]
595,Pizza,[Java],[Scala]


In [14]:
# Languages with no recorded year get a placeholder year so that sorting by Year
# still places them; the placeholder depends on which link columns are filled.
# NOTE(review): 2003 and 2024 look hand-picked to keep these rows ordered relative
# to the languages they link to — confirm against the data.
YEAR_BOTH_LINKS = 2003.0    # no year, has both "Influenced By" and "Influenced"
YEAR_ONLY_INFLBY = 2024.0   # no year, has "Influenced By" but no "Influenced"
YEAR_NO_LINKS = 1.0         # no year and no links at all (degree 0)

# The masks are computed once, before any assignment, so the three updates
# below cannot affect one another.
no_year = df["Year"].isna()
has_inflby = df["Influenced By"].notna()
has_infl = df["Influenced"].notna()

# Floats are written instead of strings such as "2003.0": assigning a string
# into a numeric column is an incompatible-dtype write that newer pandas rejects.
df.loc[no_year & has_inflby & has_infl, "Year"] = YEAR_BOTH_LINKS
df.loc[no_year & has_inflby & ~has_infl, "Year"] = YEAR_ONLY_INFLBY
df.loc[no_year & ~has_inflby & ~has_infl, "Year"] = YEAR_NO_LINKS
# NOTE(review): rows with no year, no "Influenced By" but a non-empty
# "Influenced" match none of the cases above and keep a missing Year.

In [15]:
# display the current state of the frame
df

Unnamed: 0,Language,Influenced By,Influenced,Year
0,Assembly language,,,1947.0
1,Plankalkül,,"[Superplan, ALGOL 58]",1948.0
2,Short Code,,,1950.0
3,G-code,,,1950.0
4,Superplan,[Plankalkül],[ALGOL 58],1951.0
...,...,...,...,...
691,Z notation,,,1.0
692,Zebra Programming Language,,,1.0
693,Zeno,,,1.0
694,ZOPL,,,1.0


In [16]:
# make sure Year is numeric, then order the languages from earliest to latest year
convert_dict = dict(Year=float)
df = df.astype(convert_dict)
sorted_df = df.sort_values('Year')
sorted_df

Unnamed: 0,Language,Influenced By,Influenced,Year
695,Z++,,,1.0
519,JEAN,,,1.0
518,Job Control Language,,,1.0
517,Jess,,,1.0
516,JavaFX Script,,,1.0
...,...,...,...,...
628,Script.NET,[JavaScript],,2024.0
561,Mirah,"[Ruby, Java, Boo]",,2024.0
637,Solidity,"[JavaScript, C++, Python]",,2024.0
520,JScript .NET,"[JScript, ECMAScript]",,2024.0


In [17]:
# write the cleaned, year-sorted table; the row labels are written as the first column
output_path = 'final_cleaned.csv'
sorted_df.to_csv(output_path, index=True)