In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [3]:
# Returns top names of top numNames people from year 
def getTopNames(numNames, year):
    fileName = "Names/yob" + str(year) + ".txt"
    data = pd.read_csv(fileName, names=['Name', 'Sex', 'Frequency'])
    topNames = data.head(numNames)
    return(topNames)

getTopNames(10, 1990)

Unnamed: 0,Name,Sex,Frequency
0,Jessica,F,46470
1,Ashley,F,45553
2,Brittany,F,36534
3,Amanda,F,34405
4,Samantha,F,25865
5,Sarah,F,25810
6,Stephanie,F,24859
7,Jennifer,F,22219
8,Elizabeth,F,20742
9,Lauren,F,20499


In [8]:
# Returns frequencies of name for each year (male and female)
def freqNames(name):
    frequency = {}
    for i in range(1880, 2016):
        fileName = "Names/yob" + str(i) + ".txt"
        data = pd.read_csv(fileName, names=['Name', 'Sex', 'Frequency'])
        temp = data.loc[data['Name'] == name]
        if(temp.empty): # Case where data does not have a row with that name
            frequency[i] = 0
        else:           # Case where row exists
            frequency[i] = temp.iat[0, 2]
    return frequency

print(freqNames("Bob"))

{1880: 46, 1881: 50, 1882: 59, 1883: 62, 1884: 50, 1885: 52, 1886: 66, 1887: 74, 1888: 59, 1889: 52, 1890: 53, 1891: 55, 1892: 61, 1893: 59, 1894: 48, 1895: 54, 1896: 45, 1897: 53, 1898: 61, 1899: 59, 1900: 98, 1901: 49, 1902: 64, 1903: 66, 1904: 65, 1905: 58, 1906: 69, 1907: 77, 1908: 77, 1909: 58, 1910: 103, 1911: 76, 1912: 141, 1913: 5, 1914: 217, 1915: 296, 1916: 304, 1917: 6, 1918: 5, 1919: 6, 1920: 710, 1921: 10, 1922: 787, 1923: 11, 1924: 7, 1925: 6, 1926: 9, 1927: 14, 1928: 11, 1929: 6, 1930: 13, 1931: 10, 1932: 16, 1933: 12, 1934: 8, 1935: 10, 1936: 8, 1937: 14, 1938: 7, 1939: 8, 1940: 5, 1941: 7, 1942: 7, 1943: 8, 1944: 6, 1945: 5, 1946: 2492, 1947: 5, 1948: 5, 1949: 6, 1950: 978, 1951: 806, 1952: 756, 1953: 5, 1954: 5, 1955: 6, 1956: 1020, 1957: 6, 1958: 2522, 1959: 8, 1960: 2720, 1961: 5, 1962: 5, 1963: 6, 1964: 1433, 1965: 1000, 1966: 5, 1967: 10, 1968: 478, 1969: 426, 1970: 5, 1971: 325, 1972: 220, 1973: 208, 1974: 170, 1975: 168, 1976: 139, 1977: 120, 1978: 110, 1979: 11

In [9]:
# Returns relative frequencies of name for each year (male and female)
def relativeFreqNames(name):
    frequency = {}
    sumPerYear = {}
    for i in range(1880, 2016):
        fileName = "Names/yob" + str(i) + ".txt"
        data = pd.read_csv(fileName, names=['Name', 'Sex', 'Frequency'])
        sumPerYear[i] = data['Frequency'].sum()
        temp = data.loc[data['Name'] == name]
        if(temp.empty):
            frequency[i] = 0
        else:
            frequency[i] = temp.iat[0, 2]
    
    for key in frequency:
        frequency[key] = frequency[key] / sumPerYear[key]
        
    return frequency

print(relativeFreqNames("Bob"))

{1880: 0.00022830823597145154, 1881: 0.0002594760659276788, 1882: 0.0002663248079301597, 1883: 0.0002857867201364401, 1884: 0.0002053700151563071, 1885: 0.00021589842809336776, 1886: 0.0002585001507917546, 1887: 0.0002991155879642355, 1888: 0.00019701209453909188, 1889: 0.00017996317676536954, 1890: 0.00017584546832956759, 1891: 0.0001918548879393041, 1892: 0.0001824288153790482, 1893: 0.00018141454145168532, 1894: 0.00014172168223636816, 1895: 0.00015383563517024478, 1896: 0.00012587905540356825, 1897: 0.00015275624138944772, 1898: 0.00015991149816101777, 1899: 0.00017392073341488938, 1900: 0.0002176268898008492, 1901: 0.00014169384875193744, 1902: 0.00016548842356762012, 1903: 0.00017313518972731208, 1904: 0.00016109205543549378, 1905: 0.00013681149027812832, 1906: 0.00016104149502521817, 1907: 0.0001654508535759946, 1908: 0.00015757441810018458, 1909: 0.00011345164905884033, 1910: 0.0001743637837956795, 1911: 0.00011796278274204488, 1912: 0.00014270402880799628, 1913: 4.397173848424

In [19]:
males = {}
females = {}

# Iterate through each file and create a temporary dataframe to hold names that are male and female
# Then go through the rows of each dataframe and store the name and relative frequency into males/females dictionaries
for i in range(1880, 2016):    
    fileName = "Names/yob" + str(i) + ".txt"
    data = pd.read_csv(fileName, names=['Name', 'Sex', 'Frequency'])
    temp = data[data.duplicated('Name', keep=False) == True] # Each temporary dataframe holds names that are 
                                                             # both female and male for each year
    
    # For each year calculate the sum and use that sum to calculate relative
    # frequency of each name each year
    sum = temp['Frequency'].sum()
    
    # Iterate through each name and store relative frequency in corresponding 
    # dictionary for each year
    for row in temp.itertuples():
        if(row[2] == 'M'):
            tempDict = {i:row[3] / sum}
            if(row[1] in males):
                males[row[1]].update(tempDict)
            else:
                males[row[1]] = tempDict
        else:
            tempDict = {i:row[3] / sum}
            if(row[1] in females):
                females[row[1]].update(tempDict)
            else:
                females[row[1]] = tempDict
                
                
                
                
            

{1880: 0.09586076112749332, 1881: 0.09592937392655151, 1882: 0.08624439370831942, 1883: 0.07961829054320192, 1884: 0.07353793611254719, 1885: 0.0626112823299749, 1886: 0.0608922680447146, 1887: 0.059613942752973346, 1888: 0.051686938246210255, 1889: 0.050165496842648885, 1890: 0.04742487714268183, 1891: 0.04549277920600884, 1892: 0.043201674728047874, 1893: 0.0413894173908572, 1894: 0.039309061411461566, 1895: 0.03894766407639734, 1896: 0.03859248160230635, 1897: 0.03538703100466359, 1898: 0.033822352848514495, 1899: 0.03324961470403562, 1900: 0.03268967260439809, 1901: 0.033012254585301194, 1902: 0.03204211225883316, 1903: 0.03214779195206565, 1904: 0.03159831018410263, 1905: 0.0297767482756455, 1906: 0.0287829921215963, 1907: 0.02970048238899928, 1908: 0.028104946795548695, 1909: 0.02802853410562061, 1910: 0.027261190778336757, 1911: 0.029422642194848136, 1912: 0.03303722953262539, 1913: 0.033138577760804304, 1914: 0.03411936382689287, 1915: 0.03202098253676285, 1916: 0.0316039946095