# Coal production in mines 2013 

by: Edwin Ayim

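Abstract: Results of a coal data analysis — a random forest regression that predicts the (log) production of coal mines in 2013 from employee counts, labor hours, and categorical mine attributes.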

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error

sns.set();

## Cleaned Data

The data cleaning is done in the notebook deliver/Data_Cleaning.ipynb; the cleaned data it produced is loaded below.

In [None]:
# Read the cleaned CSV into a DataFrame, using the 'MSHA ID' column as the row index.
df1 = pd.read_csv(
    "../practise/data/cleaned_coalpublic2013.xls.csv",
    index_col='MSHA ID',
)

# Preview the first rows of two columns as a quick check that the load worked.
df1.loc[:, ['Year', 'Mine_Name']].head()

# Predict the production of coal mines

In [None]:
# Numeric columns fed to the model as-is.
features = ['Average_Employees', 'Labor_Hours']

# Categorical columns; these are one-hot encoded before modelling.
categoricals = [
    'Mine_State',
    'Mine_County',
    'Mine_Status',
    'Mine_Type',
    'Company_Type',
    'Operation_Type',
    'Union_Code',
    'Coal_Supply_Region',
]

# Column the regression model is trained to predict.
target = 'Log_Production'

In [None]:
# Distribution of Log_Production for each Company_Type.
# Note: set_context changes seaborn's global plotting context, so it also
# affects every figure drawn after this cell.
sns.set_context('poster')

# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` really is the
# Figure and the plot is drawn on an explicitly named Axes.
fig, ax = plt.subplots(figsize=(14,8))

# NOTE(review): split=True is passed without a `hue` variable; its effect
# depends on the installed seaborn version — confirm it is intended.
sns.violinplot(y="Company_Type", x="Log_Production", data=df1, split=True, inner="stick", ax=ax)
fig.tight_layout()

In [None]:
# One-hot encode every categorical column.
#
# Results:
#   * df1 gains one dummy column per level of each categorical column
#     (all levels, including the reference level).
#   * dummy_categoricals lists the dummy columns to use as model features,
#     i.e. all levels except one reference level per categorical.
#
# The cell is safe to re-run: dummy columns left over from a previous run are
# replaced rather than appended a second time.
dummy_categoricals = []
dummy_frames = []
for categorical in categoricals:
    dummies = pd.get_dummies(df1[categorical], prefix=categorical)
    dummy_frames.append(dummies)

    # Avoid the dummy variable trap: leave the last level (in sorted order)
    # out of the feature list. Missing values are dropped before sorting
    # because NaN cannot be ordered against strings, and get_dummies does not
    # create a column for NaN anyway.
    levels = sorted(df1[categorical].dropna().unique())
    reference = '_'.join([categorical, str(levels[-1])]) if levels else None
    dummy_categoricals += [col for col in dummies.columns if col != reference]

if dummy_frames:
    # Concatenate once, outside the loop, instead of growing df1 per column.
    all_dummies = pd.concat(dummy_frames, axis=1)
    df1 = pd.concat(
        [df1.drop(columns=all_dummies.columns, errors='ignore'), all_dummies],
        axis=1,
    )

# Random Forest Regression

In [None]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split identical on every run, so the metrics and figures computed from
# `train`/`test` stay reproducible.
train, test = train_test_split(df1, test_size=0.3, random_state=42)

In [None]:
# Random forest with 100 trees, fitted on the numeric features plus the
# one-hot encoded categoricals. oob_score=True additionally computes an
# out-of-bag score during fitting. A fixed random_state makes the fitted
# forest, and therefore its predictions and feature importances, reproducible.
rf = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
rf.fit(train[features + dummy_categoricals], train[target])

In [None]:
# Scatter of actual vs. predicted target on the held-out test set, with a
# fitted regression line, saved to disk as a PNG.
# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` is the Figure.
fig, ax = plt.subplots(figsize=(10,8))

# x and y are passed by keyword: recent seaborn releases accept them only as
# keyword arguments, so the positional form raises a TypeError there.
sns.regplot(
    x=test[target],
    y=rf.predict(test[features + dummy_categoricals]),
    ax=ax,
)
ax.set_ylabel('Predicted_Production')

# Same range on both axes so the diagonal corresponds to a perfect prediction.
ax.set_xlim(0,22)
ax.set_ylim(0,22)
fig.tight_layout()

# NOTE(review): the file name spells "prodution"; left unchanged in case other
# material references this exact path — confirm before renaming.
fig.savefig("../practise/deliver_coal_prodution-RF-prediction.png")

In [None]:
# Score the fitted model on the held-out test set.
predicted = rf.predict(test[features + dummy_categoricals])
actual = test[target]

# (label, metric function) pairs, printed in this order.
scorers = (
    ("R^2 score:", r2_score),
    ("variance score:", explained_variance_score),
    ("MSE:", mean_squared_error),
)
for label, scorer in scorers:
    print(label, scorer(actual, predicted))

In [None]:
# Pair each model input column with its importance from the fitted forest,
# ordered from most to least important.
model_columns = train[features + dummy_categoricals].columns
rf_importances = (
    pd.DataFrame({'name': model_columns, 'importance': rf.feature_importances_})
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)

# Show the five most important inputs.
rf_importances.head(5)

# Conclusion

Detailed conclusion.