In [1]:
# DummyClassifier from sklearn is used to create a baseline accuracy.
from sklearn.dummy import DummyClassifier 
# TfidfVectorizer is used to create a sparce matrices of tf-idf scores to run through modeling
from sklearn.feature_extraction.text import TfidfVectorizer
# Import to split the data into train, test, split
from sklearn.model_selection import train_test_split
# Import to model the data
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
# Import to understand the data modeling scores
from sklearn.metrics import classification_report, accuracy_score
import acquire
import prepare
import pandas as pd
rs = 123



In [2]:
df = acquire.get_github_data()

In [3]:
# prepare the data by creating clean, stemmed and lemmatized columns
df = prepare.prep_github_data(df, column='readme_contents')

In [4]:
train, validate, test = prepare.split_github_data(df)

train---> (89, 6)
validate---> (39, 6)
test---> (32, 6)


In [5]:
# Creates a list of top three languages from dataframe
top_languages = ['JavaScript','Python','TypeScript']
# Create a new version of our df to transform for modeling
model = df.copy()
# Rename the languages that are not in the top three "not_top"
model['language'] = model.language.apply(lambda lang : lang if lang in top_languages else "not_top")
# show the distribution
model.language.value_counts(normalize=True)

JavaScript    0.42500
Python        0.36250
not_top       0.14375
TypeScript    0.06875
Name: language, dtype: float64

In [6]:
# baseline prediction
print(f'Baseline Accuracy: {round(max(train.language.value_counts()) / train.shape[0] *100)}%')

Baseline Accuracy: 43%


In [7]:
# Create the tf-idf model
tfidf = TfidfVectorizer()
# Fit the model and create the X, y variables for modeling
X = tfidf.fit_transform(model.lemmatized)
y = model.language 
# Split the data into train (55%) validate (24%) test (20%) split
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state = rs)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, stratify=y_train_validate, test_size=.3, random_state = rs)

In [8]:
# Create Result Dataframes to store actual and predictive scores from the models
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

In [9]:
# Create and fit each model
lm = LogisticRegression(multi_class='multinomial',random_state=rs).fit(X_train, y_train)
dtc = DecisionTreeClassifier(max_depth=4, random_state=rs).fit(X_train, y_train)
rf = RandomForestClassifier(min_samples_leaf=3,max_depth=4, random_state=rs).fit(X_train,y_train)
knn = KNeighborsClassifier().fit(X_train,y_train)

In [10]:
# Store the train predictions in our result df
train['lm_predicted'] = lm.predict(X_train)
train['dtc_predicted'] = dtc.predict(X_train)
train['rf_predicted'] = rf.predict(X_train)
train['knn_predicted'] = knn.predict(X_train)

In [11]:
print('Logistic Regression Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.lm_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.lm_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.lm_predicted))

Logistic Regression Accuracy: 82.02%
---
Confusion Matrix
actual        JavaScript  Python  TypeScript  not_top
lm_predicted                                         
JavaScript            38       1           6        8
Python                 0      31           0        1
not_top                0       0           0        4
---
              precision    recall  f1-score   support

  JavaScript       0.72      1.00      0.84        38
      Python       0.97      0.97      0.97        32
  TypeScript       0.00      0.00      0.00         6
     not_top       1.00      0.31      0.47        13

    accuracy                           0.82        89
   macro avg       0.67      0.57      0.57        89
weighted avg       0.80      0.82      0.77        89



In [12]:
print('Decision Tree Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.dtc_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.dtc_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.dtc_predicted))

Decision Tree Accuracy: 76.40%
---
Confusion Matrix
actual         JavaScript  Python  TypeScript  not_top
dtc_predicted                                         
JavaScript             38       8           6        4
Python                  0      24           0        3
not_top                 0       0           0        6
---
              precision    recall  f1-score   support

  JavaScript       0.68      1.00      0.81        38
      Python       0.89      0.75      0.81        32
  TypeScript       0.00      0.00      0.00         6
     not_top       1.00      0.46      0.63        13

    accuracy                           0.76        89
   macro avg       0.64      0.55      0.56        89
weighted avg       0.76      0.76      0.73        89



In [13]:
print('Random Forest Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.rf_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.rf_predicted))

Random Forest Accuracy: 71.91%
---
Confusion Matrix
actual        JavaScript  Python  TypeScript  not_top
rf_predicted                                         
JavaScript            38       8           5       10
Python                 0      24           0        2
TypeScript             0       0           1        0
not_top                0       0           0        1
---
              precision    recall  f1-score   support

  JavaScript       0.62      1.00      0.77        38
      Python       0.92      0.75      0.83        32
  TypeScript       1.00      0.17      0.29         6
     not_top       1.00      0.08      0.14        13

    accuracy                           0.72        89
   macro avg       0.89      0.50      0.51        89
weighted avg       0.81      0.72      0.67        89



In [14]:
print('KNN Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.knn_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.knn_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.knn_predicted))

KNN Accuracy: 69.66%
---
Confusion Matrix
actual         JavaScript  Python  TypeScript  not_top
knn_predicted                                         
JavaScript             32      10           3        5
Python                  4      22           0        2
TypeScript              1       0           3        1
not_top                 1       0           0        5
---
              precision    recall  f1-score   support

  JavaScript       0.64      0.84      0.73        38
      Python       0.79      0.69      0.73        32
  TypeScript       0.60      0.50      0.55         6
     not_top       0.83      0.38      0.53        13

    accuracy                           0.70        89
   macro avg       0.71      0.60      0.63        89
weighted avg       0.72      0.70      0.69        89



In [15]:
# Assign the validate predictions to the results df
validate['lm_predicted'] = lm.predict(X_validate)
validate['dtc_predicted'] = dtc.predict(X_validate)
validate['rf_predicted'] = rf.predict(X_validate)
validate['knn_predicted'] = knn.predict(X_validate)

In [16]:
print('Logistic Regression Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.lm_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate.lm_predicted, validate.actual))
print('---')
print(classification_report(validate.actual, validate.lm_predicted))

Logistic Regression Accuracy: 58.97%
---
Confusion Matrix
actual        JavaScript  Python  TypeScript  not_top
lm_predicted                                         
JavaScript            16       7           3        4
Python                 1       7           0        1
---
              precision    recall  f1-score   support

  JavaScript       0.53      0.94      0.68        17
      Python       0.78      0.50      0.61        14
  TypeScript       0.00      0.00      0.00         3
     not_top       0.00      0.00      0.00         5

    accuracy                           0.59        39
   macro avg       0.33      0.36      0.32        39
weighted avg       0.51      0.59      0.52        39



In [17]:
print('Decision Tree Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.dtc_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate.dtc_predicted, validate.actual))
print('---')
print(classification_report(validate.actual, validate.dtc_predicted))

Decision Tree Accuracy: 56.41%
---
Confusion Matrix
actual         JavaScript  Python  TypeScript  not_top
dtc_predicted                                         
JavaScript             16       7           1        4
Python                  0       5           1        0
not_top                 1       2           1        1
---
              precision    recall  f1-score   support

  JavaScript       0.57      0.94      0.71        17
      Python       0.83      0.36      0.50        14
  TypeScript       0.00      0.00      0.00         3
     not_top       0.20      0.20      0.20         5

    accuracy                           0.56        39
   macro avg       0.40      0.37      0.35        39
weighted avg       0.57      0.56      0.52        39



In [18]:
print('Random Forest Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate.rf_predicted, validate.actual))
print('---')
print(classification_report(validate.actual, validate.rf_predicted))

Random Forest Accuracy: 56.41%
---
Confusion Matrix
actual        JavaScript  Python  TypeScript  not_top
rf_predicted                                         
JavaScript            17       9           3        4
Python                 0       5           0        1
---
              precision    recall  f1-score   support

  JavaScript       0.52      1.00      0.68        17
      Python       0.83      0.36      0.50        14
  TypeScript       0.00      0.00      0.00         3
     not_top       0.00      0.00      0.00         5

    accuracy                           0.56        39
   macro avg       0.34      0.34      0.29        39
weighted avg       0.52      0.56      0.48        39



In [20]:
print('KNN Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.knn_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate.knn_predicted, validate.actual))
print('---')
print(classification_report(validate.actual, validate.knn_predicted))

KNN Accuracy: 69.23%
---
Confusion Matrix
actual         JavaScript  Python  TypeScript  not_top
knn_predicted                                         
JavaScript             14       4           1        1
Python                  2      10           1        1
not_top                 1       0           1        3
---
              precision    recall  f1-score   support

  JavaScript       0.70      0.82      0.76        17
      Python       0.71      0.71      0.71        14
  TypeScript       0.00      0.00      0.00         3
     not_top       0.60      0.60      0.60         5

    accuracy                           0.69        39
   macro avg       0.50      0.53      0.52        39
weighted avg       0.64      0.69      0.66        39



In [22]:
# Assign the predicitons to the results df
test['lm_predicted'] = lm.predict(X_test)

In [23]:
print('Logistic Regression Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.lm_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.lm_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.lm_predicted))

Logistic Regression Accuracy: 50.00%
---
Confusion Matrix
actual        JavaScript  Python  TypeScript  not_top
lm_predicted                                         
JavaScript            11       7           1        5
Python                 2       5           1        0
---
              precision    recall  f1-score   support

  JavaScript       0.46      0.85      0.59        13
      Python       0.62      0.42      0.50        12
  TypeScript       0.00      0.00      0.00         2
     not_top       0.00      0.00      0.00         5

    accuracy                           0.50        32
   macro avg       0.27      0.32      0.27        32
weighted avg       0.42      0.50      0.43        32

