# Part 1 - Stock Change-GMM

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm 

from statsmodels.sandbox.regression.gmm import IV2SLS
from statsmodels.sandbox.regression.gmm import GMM

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/annwanginnt/predictive-moduling/main/midterm_partone.csv')
df.head()

Unnamed: 0,Constant,Stock Change,Inventory Turnover,Operating Profit,Interaction Effect,Current Ratio,Quick Ratio,Debt Asset Ratio
0,1,0.870332,1.795946,0.115846,0.208053,1.672527,0.255171,0.473317
1,1,-0.047347,1.395501,0.436967,0.609788,1.637261,0.221763,0.489967
2,1,0.001176,1.664563,0.541016,0.900555,1.640619,0.189141,0.374269
3,1,-0.9012,1.605738,0.539399,0.866133,1.436221,0.131944,0.224399
4,1,-0.176353,1.591451,0.539938,0.859285,1.43314,0.183095,0.213446


In [4]:
y_vals = np.array(df['Stock Change'])
x_vals = np.array(df[['Inventory Turnover','Operating Profit', 'Interaction Effect']])
iv_vals = np.array(df[['Current Ratio','Quick Ratio','Debt Asset Ratio']])

# defining the Custom GMM class with the delta term
class CustomGMMWithDelta(GMM):
    def momcond(self, params):
        # define parameters: Including delta as the last parameter
        p0, p1, p2, p3, delta = params
        endog = self.endog
        exog = self.exog
        inst = self.instrument

        # Original errors
        error0 = endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2] 
        error1 = error0 * exog[:, 1]  # Interaction with Operating Profit
        error2 = error0 * exog[:, 2]  # Interaction with Interaction Effect

        # Errors incorporating instruments and delta
        error3 = (error0 - delta) * inst[:, 0]  # Interaction with Current Ratio adjusted by delta
        error4 = (error0 - delta) * inst[:, 1]  # Interaction with Quick Ratio adjusted by delta
        error5 = (error0 - delta) * inst[:, 2]  # Interaction with Debt Asset Ratio adjusted by delta

        g = np.column_stack((error0, error1, error2, error3, error4, error5))
        return g

# Initial parameter estimates including delta
beta0_with_delta = np.array([0.1, 0.1, 0.1, 0.1, 0.1])  # Added initial value for delta

# Running the custom GMM model with the delta term
res_with_delta = CustomGMMWithDelta(endog=y_vals, exog=x_vals, instrument=iv_vals, k_moms=6, k_params=5).fit(beta0_with_delta)

res_with_delta.summary()


Optimization terminated successfully.
         Current function value: 0.000005
         Iterations: 11
         Function evaluations: 16
         Gradient evaluations: 16
Optimization terminated successfully.
         Current function value: 0.000205
         Iterations: 8
         Function evaluations: 13
         Gradient evaluations: 13
Optimization terminated successfully.
         Current function value: 0.000205
         Iterations: 4
         Function evaluations: 8
         Gradient evaluations: 8
Optimization terminated successfully.
         Current function value: 0.000205
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5


0,1,2,3
Dep. Variable:,y,Hansen J:,0.3475
Model:,CustomGMMWithDelta,Prob (Hansen J):,0.556
Method:,GMM,,
Date:,"Sun, 12 Nov 2023",,
Time:,13:39:12,,
No. Observations:,1696,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
p 0,-0.0139,0.024,-0.588,0.556,-0.060,0.033
p 1,0.0009,0.001,1.217,0.224,-0.001,0.002
p 2,-0.1103,0.032,-3.416,0.001,-0.174,-0.047
p 3,0.0012,0.000,2.782,0.005,0.000,0.002
p 4,-0.0048,0.009,-0.534,0.594,-0.022,0.013


In [15]:
0.1103/0.0022

50.13636363636363

the coefficient of p2(operating profile) is negative, which means oeprating profile doe snot have negative impact on stock return.  however, the interaction effect [Inventory turnover]×[Operat ing prof it] has a positive coefficient of 0.0012.  if the inventory turnover exceeds 0.1103/0.0012=50.14 for the retailer, an increase in operating profile has a positive effect on stock retrurn.  However, if the inventory turnover is lower than 50.14, an increase in operating profits has a negative impact effect on stock return.

# Part 2 Bank Credit - Logistic Regression

## Question 1

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [8]:
bank_df = pd.read_csv('https://raw.githubusercontent.com/annwanginnt/predictive-moduling/main/midterm_parttwo.csv')

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder


# One-hot encode the categorical variables
categorical_cols = bank_df.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('Credit Rating')  # Remove the target variable from the list

encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(bank_df[categorical_cols])

# Create a DataFrame with the encoded data
encoded_columns = encoder.get_feature_names_out(categorical_cols)
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)

# Concatenate the encoded DataFrame with the original DataFrame (excluding the original categorical columns)
df = pd.concat([bank_df.drop(categorical_cols, axis=1), encoded_df], axis=1)

df



Unnamed: 0,Years of Education after High School,Credit Rating,Requested Credit Amount_High,Requested Credit Amount_Low,Requested Credit Amount_Medium,Number of Dependents_Less than 2,Number of Dependents_More than 2,Number of Dependents_No dependent,Monthly Income_High,Monthly Income_Low,...,Monthly Income_Very High,Monthly Income_Very low,Monthly Expense_High,Monthly Expense_Low,Monthly Expense_Moderate,Monthly Expense_Very high,Monthly Expense_Very low,Marital Status_Married,Marital Status_Not specified,Marital Status_Single
0,1,Positive,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,2,Positive,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1,Positive,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,3,Positive,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,3,Negative,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8076,3,Positive,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8077,3,Negative,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8078,3,Positive,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8079,7,Positive,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [10]:
# Splitting the dataset into training and test sets (50% each)
X = df.drop('Credit Rating', axis=1)
y = df['Credit Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Fitting a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Applying the model to the test set
y_pred = model.predict(X_test)

# Generating a confusion matrix, recall, precision, and F1 score
conf_matrix = confusion_matrix(y_test, y_pred)
Report = classification_report(y_test, y_pred)

# Creating a DataFrame for the confusion matrix with labels for better readability
conf_matrix_df = pd.DataFrame(conf_matrix, 
                              index=['Actual Negative', 'Actual Positive'], 
                              columns=['Predicted Negative', 'Predicted Positive'])

print(conf_matrix_df)
print(Report)


                 Predicted Negative  Predicted Positive
Actual Negative                   0                 571
Actual Positive                   0                3470
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       571
    Positive       0.86      1.00      0.92      3470

    accuracy                           0.86      4041
   macro avg       0.43      0.50      0.46      4041
weighted avg       0.74      0.86      0.79      4041



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Question 2

In [12]:
# Convert 'Credit Rating' in y_test from string to numerical format (1 for 'Positive', 0 for 'Negative')
y_test_numerical = y_test.apply(lambda x: 1 if x == 'Positive' else 0)

# Predict probabilities
probabilities = model.predict_proba(X_test)


# Extracting the probabilities of being 'Positive'
positive_probabilities = probabilities[:, 1]

# Finding the threshold for which 15% of the instances are above the threshold
threshold = np.percentile(positive_probabilities, 85)

print(f'Threshold for only 15% granted:{threshold}:\n')


# Updating the predictions based on the new threshold
y_pred_updated = (positive_probabilities >= threshold).astype(int)

# Converting 'Credit Rating' in y_test from string to numerical format
y_test_numerical = y_test.apply(lambda x: 1 if x == 'Positive' else 0)

# Generating the updated confusion matrix and classification report
conf_matrix_updated = confusion_matrix(y_test_numerical, y_pred_updated)
class_report_updated = classification_report(y_test_numerical, y_pred_updated)

conf_matrix_updated = pd.DataFrame(conf_matrix_updated, 
                              index=['Actual Negative', 'Actual Positive'], 
                              columns=['Predicted Negative', 'Predicted Positive'])

# Print the results
print(conf_matrix_updated)
print()
print(class_report_updated)



Threshold for only 15% granted:0.8783077355356468:

                 Predicted Negative  Predicted Positive
Actual Negative                 483                  88
Actual Positive                2845                 625

              precision    recall  f1-score   support

           0       0.15      0.85      0.25       571
           1       0.88      0.18      0.30      3470

    accuracy                           0.27      4041
   macro avg       0.51      0.51      0.27      4041
weighted avg       0.77      0.27      0.29      4041

