In [1]:
import numpy as np
import pandas as pd

Reference (background for the row/column profile and chi-square distance calculations below): https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3718710/

In [2]:
# Load the contingency table of counts; the first CSV column supplies the
# row labels (index) and the header row supplies the column labels.
contingency_csv = 'country_language_contingency.csv'
df = pd.read_csv(contingency_csv, index_col=0)

In [3]:
# Marginal totals of the contingency table.
row_totals = df.sum(axis=1)    # one total per row label
col_totals = df.sum(axis=0)    # one total per column label
# Grand total over every cell: sum the column totals, then sum those.
cell_totals = df.sum().sum()

In [4]:
# Row profiles: every row of the contingency table divided by its own row
# total, so each profile sums to 1. `axis=0` aligns `row_totals` with the
# row labels. (The former per-row loop relied on `.ix`, which no longer
# exists in current pandas.)
row_profiles = df.div(row_totals, axis=0)

# Average row profile = mass-weighted centroid of the row profiles, which is
# the vector of column masses (column total / grand total). A plain
# unweighted mean of the profiles matches this only when every row has the
# same total, so the mass form is used to stay correct for unbalanced tables
# and to mirror how the average column profile is computed.
avg_row_profile = col_totals / cell_totals

print("Row profiles:\n{}\n".format(row_profiles))
print("Average row profile:\n{}".format(avg_row_profile))

Row profiles:
             English  French  Spanish  German  Italian
Canada         0.688   0.280    0.010   0.011    0.011
USA            0.730   0.031    0.190   0.008    0.041
England        0.798   0.074    0.038   0.031    0.059
Italy          0.017   0.013    0.011   0.015    0.944
Switzerland    0.015   0.222    0.020   0.648    0.095

Average row profile:
English    0.4496
French     0.1240
Spanish    0.0538
German     0.1426
Italian    0.2300
dtype: float64


----

In [5]:
# Column profiles: every column of the contingency table divided by its own
# column total, then transposed so that each original column becomes one row
# of the result (each row of `column_profiles` sums to 1).
# `axis=1` aligns `col_totals` with the column labels. (The former loop used
# `DataFrame.iteritems()` and `.ix`, both removed from current pandas.)
column_profiles = df.div(col_totals, axis=1).T

# Average column profile = row masses (row total / grand total).
avg_col_profile = row_totals / cell_totals

print("Column profiles:\n{}\n".format(column_profiles))
print("Average col profile:\n{}".format(avg_col_profile))

Column profiles:
           Canada       USA   England     Italy  Switzerland
English  0.306050  0.324733  0.354982  0.007562     0.006673
French   0.451613  0.050000  0.119355  0.020968     0.358065
Spanish  0.037175  0.706320  0.141264  0.040892     0.074349
German   0.015428  0.011220  0.043478  0.021038     0.908836
Italian  0.009565  0.035652  0.051304  0.820870     0.082609

Average col profile:
Canada         0.2
USA            0.2
England        0.2
Italy          0.2
Switzerland    0.2
dtype: float64


In [6]:
# Chi-square distance of each row profile from the average row profile:
# squared deviations are weighted by the inverse of the average profile,
# summed, and square-rooted.
for _, row in row_profiles.iterrows():
    deviation = row - avg_row_profile
    weighted_sq_deviation = deviation ** 2 / avg_row_profile
    chisqd_from_avg = np.sqrt(weighted_sq_deviation.sum())
    print("{}: {}".format(row.name, chisqd_from_avg))

Canada: 0.8296401646386912
USA: 0.9336953580746634
England: 0.7136205964121708
Italy: 1.6971555483780416
Switzerland: 1.5457213408022732


In [7]:
# Chi-square distance of the last row profile from the average row profile.
# The profile is selected explicitly so this cell does not depend on a loop
# variable left behind by another cell.
last_row_profile = row_profiles.iloc[-1]
np.sqrt(np.sum(np.square(last_row_profile - avg_row_profile) / avg_row_profile))

1.5457213408022732

----

Reference: "An Introduction to Correspondence Analysis" (The Mathematica Journal, 2010) — http://www.mathematica-journal.com/2010/09/an-introduction-to-correspondence-analysis/

In [8]:
# Correspondence matrix: every cell count expressed as a proportion of the
# grand total, so all entries together sum to 1.
grand_total = df.sum().sum()
correspondence_matrix = df.div(grand_total)
correspondence_matrix

Unnamed: 0,English,French,Spanish,German,Italian
Canada,0.1376,0.056,0.002,0.0022,0.0022
USA,0.146,0.0062,0.038,0.0016,0.0082
England,0.1596,0.0148,0.0076,0.0062,0.0118
Italy,0.0034,0.0026,0.0022,0.003,0.1888
Switzerland,0.003,0.0444,0.004,0.1296,0.019


In [9]:
# Row masses: sum each row of the correspondence matrix across its columns.
correspondence_matrix.sum(axis="columns")

Canada         0.2
USA            0.2
England        0.2
Italy          0.2
Switzerland    0.2
dtype: float64

In [10]:
# Column masses: sum each column of the correspondence matrix down its rows.
correspondence_matrix.sum(axis="index")

English    0.4496
French     0.1240
Spanish    0.0538
German     0.1426
Italian    0.2300
dtype: float64