In [9]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [10]:
def read_train_csv():
    """Load "training_data.csv" and relabel its columns with short names.

    The file is read with pandas from the current working directory. Whatever
    header the file carries is overwritten positionally with the five names
    listed below.

    Returns:
        pandas.DataFrame: the CSV contents under the columns
        'MonLastDon', 'NoDon', 'TotVolDon', 'MonFirstDon', 'DonMar2007'.
    """
    short_names = ['MonLastDon', 'NoDon', 'TotVolDon', 'MonFirstDon', 'DonMar2007']
    training_frame = pd.read_csv("training_data.csv")
    # Positional relabel: pandas rejects the assignment if the file does not
    # have exactly as many columns as there are names.
    training_frame.columns = short_names
    return training_frame
read_train_csv()

Unnamed: 0,MonLastDon,NoDon,TotVolDon,MonFirstDon,DonMar2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
571,23,1,250,23,0
572,16,3,750,86,0
573,21,2,500,52,0
574,39,1,250,39,0


In [11]:
def read_test_csv():
    """Load "test_data.csv" from the current working directory.

    Returns:
        pandas.DataFrame: the file's contents exactly as parsed by pandas;
        no renaming or other post-processing is applied here.
    """
    return pd.read_csv("test_data.csv")
read_test_csv()

Unnamed: 0,MonLastDon,NoDon,MonFirstDon,AveDonPerPeriod
0,2,12,52,0.692308
1,21,7,38,0.552632
2,4,1,4,0.750000
3,11,11,38,0.868421
4,4,12,34,1.058824
...,...,...,...,...
111,11,9,33,0.818182
112,16,6,40,0.450000
113,16,3,19,0.473684
114,8,15,77,0.584416


In [13]:
def check_duplicates_train():
    """Count the duplicated rows in the training data.

    The frame is obtained from read_train_csv(). duplicated() flags each row
    that repeats another row across all columns, and the flags are summed.

    Returns:
        The number of rows flagged as duplicates.
    """
    # do not edit the predefined function name
    repeated_row_flags = read_train_csv().duplicated()
    return repeated_row_flags.sum()
check_duplicates_train()

153

In [16]:
def check_null_values_train():
    """Report how many null values each training column contains.

    The frame is obtained from read_train_csv().

    Returns:
        pandas.Series: indexed by column name, holding the per-column count
        of null entries.
    """
    # do not edit the predefined function name
    training_frame = read_train_csv()
    null_flags = training_frame.isnull()
    # Summing the boolean flags column-wise yields one count per column.
    return null_flags.sum()
check_null_values_train()

MonLastDon     0
NoDon          0
TotVolDon      0
MonFirstDon    0
DonMar2007     0
dtype: int64

In [17]:
def check_duplicates_test():
    """Count the duplicated rows in the test data.

    The frame is obtained from read_test_csv(). duplicated() flags each row
    that repeats another row across all columns, and the flags are summed.

    Returns:
        The number of rows flagged as duplicates.
    """
    # do not edit the predefined function name
    repeated_row_flags = read_test_csv().duplicated()
    return repeated_row_flags.sum()
check_duplicates_test()

0

In [19]:
def check_null_values_test():
    """Report how many null values each test-data column contains.

    The frame is obtained from read_test_csv().

    Returns:
        pandas.Series: indexed by column name, holding the per-column count
        of null entries.
    """
    # do not edit the predefined function name
    test_frame = read_test_csv()
    null_flags = test_frame.isnull()
    # Summing the boolean flags column-wise yields one count per column.
    return null_flags.sum()
check_null_values_test()

MonLastDon         0
NoDon              0
MonFirstDon        0
AveDonPerPeriod    0
dtype: int64

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
def ratio():
    """Compute, row by row, 'TotVolDon' divided by 'NoDon' for the training data.

    The frame is obtained from read_train_csv().

    Returns:
        pandas.Series: the element-wise quotient TotVolDon / NoDon, aligned
        with the training frame's index.
    """
    donors = read_train_csv()
    total_volume = donors["TotVolDon"]
    donation_count = donors["NoDon"]
    return total_volume / donation_count
ratio()

0      250.0
1      250.0
2      250.0
3      250.0
4      250.0
       ...  
571    250.0
572    250.0
573    250.0
574    250.0
575    250.0
Length: 576, dtype: float64

In [22]:
read_train_csv()

Unnamed: 0,MonLastDon,NoDon,TotVolDon,MonFirstDon,DonMar2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
571,23,1,250,23,0
572,16,3,750,86,0
573,21,2,500,52,0
574,39,1,250,39,0


In [28]:
def drop():
    """Return the training data without its 'TotVolDon' column.

    A fresh frame is loaded through read_train_csv() on every call, so
    nothing outside this function is modified.

    Returns:
        pandas.DataFrame: the training frame with 'TotVolDon' removed and
        all other columns kept in their existing order.
    """
    # axis=1 targets columns; the non-inplace form hands back the reduced frame.
    return read_train_csv().drop("TotVolDon", axis=1)

In [31]:
def split():
    """Build the feature matrix and target, then split them 80/20.

    Steps:
      1. Take the frame from drop() (training data without 'TotVolDon').
      2. Separate the 'DonMar2007' column as the target.
      3. Derive 'AveDonPerPeriod' = NoDon / (MonFirstDon / 3).
      4. Keep the feature columns in the order
         MonLastDon, NoDon, MonFirstDon, AveDonPerPeriod.
      5. Split with train_test_split(test_size=0.2, random_state=42).

    Returns:
        tuple: (X_train, X_test, y_train, y_test), in that order.
    """
    donors = drop()

    # pop() removes the target column from the frame and returns it in one step.
    target = donors.pop("DonMar2007")

    # Dividing by 3 converts 'MonFirstDon' into a number of periods; this
    # assumes one period spans 3 months.
    periods_since_first = donors["MonFirstDon"] / 3

    # Donations made per period since the first donation.
    donors["AveDonPerPeriod"] = donors["NoDon"] / periods_since_first

    # Select the feature columns in a fixed order (the new column comes last).
    feature_order = ["MonLastDon", "NoDon", "MonFirstDon", "AveDonPerPeriod"]
    X = donors[feature_order].copy()
    Y = target.copy()

    # 20% of the rows are held out; the fixed seed makes the partition
    # reproducible from call to call.
    partitions = train_test_split(X, Y, test_size=0.2, random_state=42)

    # train_test_split yields X_train, X_test, y_train, y_test in this order.
    return tuple(partitions)
split()

(     MonLastDon  NoDon  MonFirstDon  AveDonPerPeriod
 437           2      6           45         0.400000
 63            2      2            4         1.500000
 208           4      4           43         0.279070
 60            4     17           71         0.718310
 15            2     15           49         0.918367
 ..          ...    ...          ...              ...
 71            2      6           28         0.642857
 106           4      5           28         0.535714
 270          11      5           50         0.300000
 435           4      9           55         0.490909
 102           2      5           34         0.441176
 
 [460 rows x 4 columns],
      MonLastDon  NoDon  MonFirstDon  AveDonPerPeriod
 234          14      2           14         0.428571
 118           2      2           11         0.545455
 346          23      2           28         0.214286
 498           9      1            9         0.333333
 402           4     13           39         1.000000
 

In [32]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [35]:
# LogisticRegression
def model():
    """Fit a logistic-regression classifier and report its hold-out accuracy.

    The train/hold-out partition comes from split(). A default-configured
    LogisticRegression is fitted on the training part and scored on the
    held-out part with accuracy_score.

    Side effects:
        Rebinds the module-level name ``logistic_model`` to the newly fitted
        estimator on every call; predict() reads the estimator from there.

    Returns:
        The accuracy on the held-out split, exactly as accuracy_score
        returns it. The value is NOT rounded (e.g. 0.7586206896551724).
    """
    # Published at module level because predict() picks the estimator up from there.
    global logistic_model

    # test_data.csv is deliberately not read here: nothing in this function
    # uses it, and loading it only made training fail when the file was absent.
    X_train, X_test, y_train, y_test = split()

    logistic_model = LogisticRegression()
    logistic_model.fit(X_train, y_train)

    # Score on the rows that were held out of training.
    y_pred = logistic_model.predict(X_test)
    return accuracy_score(y_test, y_pred)
model()

0.7586206896551724

In [36]:
def predict():
    """Compare actual and predicted labels on the held-out split.

    model() is called first so that the module-level ``logistic_model`` is
    (re)fitted. The held-out features and labels are then taken from split()
    and the fitted estimator predicts a label for each held-out row.

    NOTE(review): predictions are made for the hold-out part of the training
    data, not for test_data.csv — confirm this is the intended behaviour.

    Returns:
        pandas.DataFrame: indexed like the held-out labels, with the columns
        "Actual Values" (y_test) and "Predicted Values" (model output).
    """
    # Called for its side effect: it binds the fitted estimator to `logistic_model`.
    model()

    # test_data.csv is deliberately not read here: nothing in this function uses it.
    # Only the held-out parts are needed; split() is relied on to return the
    # same partition that model() was scored on.
    _, X_test, _, y_test = split()

    pred = logistic_model.predict(X_test)

    # y_test is a Series, so its index becomes the frame's index; `pred` is
    # positionally aligned with it because both derive from X_test's row order.
    return pd.DataFrame({"Actual Values": y_test, "Predicted Values": pred})
predict()

Unnamed: 0,Actual Values,Predicted Values
234,0,0
118,0,0
346,0,0
498,0,0
402,1,0
...,...,...
75,1,0
355,1,0
244,0,0
272,0,0
