In [1]:
import pandas as pd
import numpy as np

In [2]:
def insert_nan(df, missing_percentage):
    # Make a copy of the dataframe
    df_nan = df.copy()
    
    # Calculate the number of missing values to insert
    missing_count = int(missing_percentage * df_nan.size)
    
    # Select random indices for the missing values
    indices = np.random.choice(df_nan.index.values, missing_count, replace=True)
    
    # Select random columns for the missing values
    columns = np.random.choice(df_nan.columns.values, missing_count, replace=True)
    
    # Insert the missing values
    for i, c in zip(indices, columns):
        df_nan.loc[i, c] = np.nan
    
    return df_nan


In [3]:
df = pd.read_csv('clean_fifa_worldcup_matches.csv')
print(df)
df_nan = insert_nan(df, 0.5)
print('--------------------------------\n-------------------------------')
print(df_nan)

      HomeTeam       AwayTeam  Year  HomeGoals  AwayGoals  TotalGoals
0       France         Mexico  1930          4          1           5
1      Uruguay      Argentina  1930          4          2           6
2      Uruguay     Yugoslavia  1930          6          1           7
3    Argentina  United States  1930          6          1           7
4     Paraguay        Belgium  1930          1          0           1
..         ...            ...   ...        ...        ...         ...
895     Brazil     Costa Rica  2018          2          0           2
896     Serbia    Switzerland  2018          1          2           3
897     Serbia         Brazil  2018          0          2           2
898     France           Peru  2018          1          0           1
899     Brazil        Belgium  2018          1          2           3

[900 rows x 6 columns]
--------------------------------
-------------------------------
     HomeTeam     AwayTeam    Year  HomeGoals  AwayGoals  TotalGoals
0 

In [4]:
def insert_3nan(df, mcar_percentage, mar_percentage, mnar_percentage):
    """Inserts NaN values into a Pandas dataframe.

    Args:
        df: A Pandas dataframe.
        mcar_percentage: The percentage of Missing Completely at Random (MCAR) values to be inserted.
        mar_percentage: The percentage of Missing at Random (MAR) values to be inserted.
        mnar_percentage: The percentage of Missing Not at Random (MNAR) values to be inserted.

    Returns:
        A tuple of three dataframes, each with a different type of missing values.
    """
    # Make copies of the original dataframe
    df_mcar = df.copy()
    df_mar = df.copy()
    df_mnar = df.copy()
    
    # Calculate the number of missing values to insert for each dataframe
    mcar_count = int(mcar_percentage * df.size)
    mar_count = int(mar_percentage * df.size)
    mnar_count = int(mnar_percentage * df.size)
    
    # Insert MCAR values
    # Select random indices and columns for the missing values
    mcar_indices = np.random.choice(df_mcar.index.values, mcar_count, replace=True)
    mcar_columns = np.random.choice(df_mcar.columns.values, mcar_count, replace=True)
    # Insert the missing values
    for i, c in zip(mcar_indices, mcar_columns):
        df_mcar.loc[i, c] = np.nan
    
    # Insert MAR values
    # Select random columns for the missing values
    mar_columns = np.random.choice(df_mar.columns.values, mar_count, replace=True)
    # Insert the missing values at random indices for the selected columns
    for c in mar_columns:
        mar_indices = np.random.choice(df_mar[c].index.values, mar_count, replace=True)
        for i in mar_indices:
            df_mar.loc[i, c] = np.nan
    
    # Insert MNAR values
    # Select random rows for the missing values
    mnar_indices = np.random.choice(df_mnar.index.values, mnar_count, replace=True)
    # Insert the missing values at random columns for the selected rows
    for i in mnar_indices:
        mnar_columns = np.random.choice(df_mnar.columns.values, mnar_count, replace=True)
        for c in mnar_columns:
            df_mnar.loc[i, c] = np.nan
    
    return df_mcar, df_mar, df_mnar

This method is useful for generating different types of missing values in a Pandas dataframe, and the missing values are inserted according to their corresponding definitions.

Here is a brief explanation of the three types of missing values and how they are generated in this method:

Missing Completely at Random (MCAR): These missing values are completely random and are not related to any other variables in the data. In this method, MCAR values are inserted by randomly selecting indices and columns for the missing values.
Missing at Random (MAR): These missing values are not completely random, but they are not related to the values of the missing variables themselves. In this method, MAR values are inserted by randomly selecting columns for the missing values and then randomly selecting indices for the selected columns.
Missing Not at Random (MNAR): These missing values are not random and are related to the values of the missing variables themselves. In this method, MNAR values are inserted by randomly selecting rows for the missing values and then randomly selecting columns for the selected rows.

In [5]:
fifa23 = pd.read_csv('./data/fifa23.csv')
fifa23

Unnamed: 0,Known As,Full Name,Overall,Potential,Value(in Euro),Positions Played,Best Position,Nationality,Image Link,Age,...,LM Rating,CM Rating,RM Rating,LWB Rating,CDM Rating,RWB Rating,LB Rating,CB Rating,RB Rating,GK Rating
0,L. Messi,Lionel Messi,91,91,54000000,RW,CAM,Argentina,https://cdn.sofifa.net/players/158/023/23_60.png,35,...,91,88,91,67,66,67,62,53,62,22
1,K. Benzema,Karim Benzema,91,91,64000000,"CF,ST",CF,France,https://cdn.sofifa.net/players/165/153/23_60.png,34,...,89,84,89,67,67,67,63,58,63,21
2,R. Lewandowski,Robert Lewandowski,91,91,84000000,ST,ST,Poland,https://cdn.sofifa.net/players/188/545/23_60.png,33,...,86,83,86,67,69,67,64,63,64,22
3,K. De Bruyne,Kevin De Bruyne,91,91,107500000,"CM,CAM",CM,Belgium,https://cdn.sofifa.net/players/192/985/23_60.png,31,...,91,91,91,82,82,82,78,72,78,24
4,K. Mbappé,Kylian Mbappé,91,95,190500000,"ST,LW",ST,France,https://cdn.sofifa.net/players/231/747/23_60.png,23,...,92,84,92,70,66,70,66,57,66,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18534,D. Collins,Darren Collins,47,56,110000,"ST,RM",CAM,Republic of Ireland,https://cdn.sofifa.net/players/243/725/23_60.png,21,...,50,44,50,41,38,41,40,36,40,15
18535,Yang Dejiang,Dejiang Yang,47,57,90000,CDM,CDM,China PR,https://cdn.sofifa.net/players/261/933/23_60.png,17,...,45,45,45,47,48,47,49,49,49,15
18536,L. Mullan,Liam Mullan,47,67,130000,CM,RM,Northern Ireland,https://cdn.sofifa.net/players/267/823/23_60.png,18,...,52,49,52,46,44,46,46,42,46,17
18537,D. McCallion,Daithí McCallion,47,61,100000,CB,CB,Republic of Ireland,https://cdn.sofifa.net/players/267/824/23_60.png,17,...,33,33,33,44,42,44,47,49,47,15


In [6]:
fifa23.columns

Index(['Known As', 'Full Name', 'Overall', 'Potential', 'Value(in Euro)',
       'Positions Played', 'Best Position', 'Nationality', 'Image Link', 'Age',
       'Height(in cm)', 'Weight(in kg)', 'TotalStats', 'BaseStats',
       'Club Name', 'Wage(in Euro)', 'Release Clause', 'Club Position',
       'Contract Until', 'Club Jersey Number', 'Joined On', 'On Loan',
       'Preferred Foot', 'Weak Foot Rating', 'Skill Moves',
       'International Reputation', 'National Team Name',
       'National Team Image Link', 'National Team Position',
       'National Team Jersey Number', 'Attacking Work Rate',
       'Defensive Work Rate', 'Pace Total', 'Shooting Total', 'Passing Total',
       'Dribbling Total', 'Defending Total', 'Physicality Total', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
       'Dribbling', 'Curve', 'Freekick Accuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
       'Shot Powe

In [7]:
fifa23 = fifa23.drop(columns={'Image Link', 'Club Jersey Number', 'Joined On', 'On Loan', 'Contract Until'})
fifa23.columns

Index(['Known As', 'Full Name', 'Overall', 'Potential', 'Value(in Euro)',
       'Positions Played', 'Best Position', 'Nationality', 'Age',
       'Height(in cm)', 'Weight(in kg)', 'TotalStats', 'BaseStats',
       'Club Name', 'Wage(in Euro)', 'Release Clause', 'Club Position',
       'Preferred Foot', 'Weak Foot Rating', 'Skill Moves',
       'International Reputation', 'National Team Name',
       'National Team Image Link', 'National Team Position',
       'National Team Jersey Number', 'Attacking Work Rate',
       'Defensive Work Rate', 'Pace Total', 'Shooting Total', 'Passing Total',
       'Dribbling Total', 'Defending Total', 'Physicality Total', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
       'Dribbling', 'Curve', 'Freekick Accuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
       'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots',
       'Aggression', 'Interceptio

In [10]:
fifa23_categorical = fifa23[['Known As', 'Full Name', 'Positions Played', 'Best Position', 'Nationality', 'Club Name', 'Club Position', 'Preferred Foot', 'Weak Foot Rating', 'National Team Name',
       'National Team Image Link', 'National Team Position',
       'National Team Jersey Number', 'Attacking Work Rate',
       'Defensive Work Rate']]
fifa23_categorical.columns

Index(['Known As', 'Full Name', 'Positions Played', 'Best Position',
       'Nationality', 'Club Name', 'Club Position', 'Preferred Foot',
       'Weak Foot Rating', 'National Team Name', 'National Team Image Link',
       'National Team Position', 'National Team Jersey Number',
       'Attacking Work Rate', 'Defensive Work Rate'],
      dtype='object')

In [11]:
fifa23_categorical

Unnamed: 0,Known As,Full Name,Positions Played,Best Position,Nationality,Club Name,Club Position,Preferred Foot,Weak Foot Rating,National Team Name,National Team Image Link,National Team Position,National Team Jersey Number,Attacking Work Rate,Defensive Work Rate
0,L. Messi,Lionel Messi,RW,CAM,Argentina,Paris Saint-Germain,RW,Left,4,Argentina,https://cdn.sofifa.net/flags/ar.png,RW,10,Low,Low
1,K. Benzema,Karim Benzema,"CF,ST",CF,France,Real Madrid CF,CF,Right,4,France,https://cdn.sofifa.net/flags/fr.png,ST,19,Medium,Medium
2,R. Lewandowski,Robert Lewandowski,ST,ST,Poland,FC Barcelona,ST,Right,4,Poland,https://cdn.sofifa.net/flags/pl.png,ST,9,High,Medium
3,K. De Bruyne,Kevin De Bruyne,"CM,CAM",CM,Belgium,Manchester City,CM,Right,5,Belgium,https://cdn.sofifa.net/flags/be.png,RF,7,High,High
4,K. Mbappé,Kylian Mbappé,"ST,LW",ST,France,Paris Saint-Germain,ST,Right,4,France,https://cdn.sofifa.net/flags/fr.png,ST,10,High,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18534,D. Collins,Darren Collins,"ST,RM",CAM,Republic of Ireland,Sligo Rovers,RES,Right,3,-,-,-,-,Medium,Medium
18535,Yang Dejiang,Dejiang Yang,CDM,CDM,China PR,Guangzhou FC,SUB,Right,3,-,-,-,-,Medium,Medium
18536,L. Mullan,Liam Mullan,CM,RM,Northern Ireland,Derry City,SUB,Right,3,-,-,-,-,High,Medium
18537,D. McCallion,Daithí McCallion,CB,CB,Republic of Ireland,Derry City,SUB,Right,3,-,-,-,-,Medium,Medium


In [12]:
fifa23 = fifa23.drop(columns={'Known As', 'Full Name', 'Positions Played', 'Best Position', 'Nationality', 'Club Name', 'Club Position', 'Preferred Foot', 'Weak Foot Rating', 'National Team Name',
       'National Team Image Link', 'National Team Position',
       'National Team Jersey Number', 'Attacking Work Rate',
       'Defensive Work Rate'})
fifa23

Unnamed: 0,Overall,Potential,Value(in Euro),Age,Height(in cm),Weight(in kg),TotalStats,BaseStats,Wage(in Euro),Release Clause,...,LM Rating,CM Rating,RM Rating,LWB Rating,CDM Rating,RWB Rating,LB Rating,CB Rating,RB Rating,GK Rating
0,91,91,54000000,35,169,67,2190,452,195000,99900000,...,91,88,91,67,66,67,62,53,62,22
1,91,91,64000000,34,185,81,2147,455,450000,131199999,...,89,84,89,67,67,67,63,58,63,21
2,91,91,84000000,33,185,81,2205,458,420000,172200000,...,86,83,86,67,69,67,64,63,64,22
3,91,91,107500000,31,181,70,2303,483,350000,198900000,...,91,91,91,82,82,82,78,72,78,24
4,91,95,190500000,23,182,73,2177,470,230000,366700000,...,92,84,92,70,66,70,66,57,66,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18534,47,56,110000,21,174,68,1287,274,500,193000,...,50,44,50,41,38,41,40,36,40,15
18535,47,57,90000,17,175,60,1289,267,500,158000,...,45,45,45,47,48,47,49,49,49,15
18536,47,67,130000,18,170,65,1333,277,500,332000,...,52,49,52,46,44,46,46,42,46,17
18537,47,61,100000,17,178,65,1113,226,500,218000,...,33,33,33,44,42,44,47,49,47,15


In [15]:
import statsmodels.api as sm

# Split the dataset into predictor variables (X) and the dependent variable (y)
X = fifa23.drop('Overall', axis=1)
y = fifa23['Overall']

# Fit a Poisson regression model on the full dataset
model = sm.Poisson(y, X).fit()

# Generate datasets with missing values
df_mcar, df_mar, df_mnar = insert_3nan(fifa23, 0.5, 0.5, 0.5)

# Split the datasets with missing values into predictor variables (X_mcar, X_mar, X_mnar) and the dependent variable (y_mcar, y_mar, y_mnar)
X_mcar = df_mcar.drop('Overall', axis=1)
y_mcar = df_mcar['Overall']
X_mar = df_mar.drop('Overall', axis=1)
y_mar = df_mar['Overall']
X_mnar = df_mnar.drop('Overall', axis=1)
y_mnar = df_mnar['Overall']

# Fit Poisson regression models on the datasets with missing values
model_mcar = sm.Poisson(y_mcar, X_mcar).fit()
model_mar = sm.Poisson(y_mar, X_mar).fit()
model_mnar = sm.Poisson(y_mnar, X_mnar).fit()

  return stats.poisson.cdf(y, np.exp(X))
  L = np.exp(np.dot(X,params) + exposure + offset)
  return -np.dot(L*X.T, X)
  L = np.exp(np.dot(X,params) + offset + exposure)


Optimization terminated successfully.
         Current function value: nan
         Iterations 3


KeyboardInterrupt: 