In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np 
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
import prepare as p

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaylabrock/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kaylabrock/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaylabrock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_json('data.json')

In [3]:
df = p.create_df(df)

In [4]:
df.head()

Unnamed: 0,repo,language,original,lemmatized
0,acidanthera/AppleALC,C++,AppleALC\n========\n\n[![Build Status](https:/...,applealc acidanthera applealc workflow ci badg...
1,gongjianhui/AppleDNS,Python,# Final AppleDNS Pro\n\nAppleDNS 通过收集 Apple 在中...,final appledns pro appledns cdn itunes icloud ...
2,tomaz/appledoc,Objective-C,About appledoc\n==============\n\n**IMPORTANT ...,appledoc important notice collaborator needed ...
3,robovm/apple-ios-samples,Objective-C,# Mirror of Apple's iOS samples\n\nThis reposi...,mirror io sample repository mirror io sample l...
4,appleseedhq/appleseed,C++,# appleseed [![Build Status](https://travis-ci...,appleseed travis ci appleseedhq appleseed svg ...


In [5]:
df.shape

(368, 4)

In [6]:
df.language.value_counts()

Swift          76
JavaScript     59
Objective-C    56
Python         45
C              34
Java           30
C++            20
Shell          19
PHP            18
C#             11
Name: language, dtype: int64

In [7]:
df.dtypes

repo          object
language      object
original      object
lemmatized    object
dtype: object

In [8]:
df

Unnamed: 0,repo,language,original,lemmatized
0,acidanthera/AppleALC,C++,AppleALC\n========\n\n[![Build Status](https:/...,applealc acidanthera applealc workflow ci badg...
1,gongjianhui/AppleDNS,Python,# Final AppleDNS Pro\n\nAppleDNS 通过收集 Apple 在中...,final appledns pro appledns cdn itunes icloud ...
2,tomaz/appledoc,Objective-C,About appledoc\n==============\n\n**IMPORTANT ...,appledoc important notice collaborator needed ...
3,robovm/apple-ios-samples,Objective-C,# Mirror of Apple's iOS samples\n\nThis reposi...,mirror io sample repository mirror io sample l...
4,appleseedhq/appleseed,C++,# appleseed [![Build Status](https://travis-ci...,appleseed travis ci appleseedhq appleseed svg ...
...,...,...,...,...
481,PatMurrayDEV/apple-music-history,JavaScript,[![Netlify Status](https://api.netlify.com/api...,netlify api netlify api v1 badge e207d990 1ee1...
482,antongorodezkiy/codeigniter-apns,PHP,"Codeigniter-apns\n(c) 2012, Anton Gorodezkiy\n...",codeigniter apns c 2012 anton gorodezkiy codei...
484,lprhodes/homebridge-apple-tv,JavaScript,# Homebridge Apple TV\n\n## Introduction\nWelc...,homebridge tv welcome tv plugin homebridge nfa...
485,lvsti/CoreMediaIO-DAL-Example,C++,# CoreMediaIO-DAL-Example\n\nModernized and ex...,coremediaio dal example modernized extended co...


In [9]:
df['lemmatized'] = df['lemmatized'].astype('str') 

In [10]:
all_lemmatized = ' '.join(df.lemmatized)
Swift_lemmatized = ' '.join(df[df.language == 'Swift'].lemmatized)
Javascript_lemmatized = ' '.join(df[df.language == 'JavaScript'].lemmatized)
objectiveC_lemmatized = ' '.join(df[df.language == 'Objective-C'].lemmatized)
Python_lemmatized = ' '.join(df[df.language == 'Python'].lemmatized)
C_lemmatized = ' '.join(df[df.language == 'C'].lemmatized)
Java_lemmatized = ' '.join(df[df.language == 'Java'].lemmatized)
Cplusplus_lemmatized = ' '.join(df[df.language == 'C++'].lemmatized)
Shell_lemmatized = ' '.join(df[df.language == 'Shell'].lemmatized)
PHP_lemmatized = ' '.join(df[df.language == 'PHP'].lemmatized)
Cpound_lemmatized = ' '.join(df[df.language == 'C#'].lemmatized)

In [11]:
All_freq = pd.Series(all_lemmatized.split()).value_counts()
Swift_freq = pd.Series(Swift_lemmatized.split()).value_counts()
Javascript_freq = pd.Series(Javascript_lemmatized.split()).value_counts()
ObjectiveC_freq = pd.Series(objectiveC_lemmatized.split()).value_counts()
Python_freq = pd.Series(Python_lemmatized.split()).value_counts()
C_freq = pd.Series(C_lemmatized.split()).value_counts()
Java_freq = pd.Series(Java_lemmatized.split()).value_counts()
Cplusplus_freq = pd.Series(Cplusplus_lemmatized.split()).value_counts()
Shell_freq = pd.Series(Shell_lemmatized.split()).value_counts()
PHP_freq = pd.Series(PHP_lemmatized.split()).value_counts()
Cpound_freq = pd.Series(Cpound_lemmatized.split()).value_counts()

In [12]:
word_counts = pd.concat([All_freq, Swift_freq, Javascript_freq, ObjectiveC_freq, Python_freq, C_freq, Java_freq, Cplusplus_freq, Shell_freq, PHP_freq, Cpound_freq], sort=True, axis=1)
word_counts.columns = ['All', 'Swift', 'Javascript', 'ObjectiveC', 'Python', 'C', 'Java', 'C++', 'Shell', 'PHP', 'C#']
word_counts = word_counts.fillna(0).apply(lambda s: s.astype(int))

In [13]:
word_counts.sort_values(by= 'All', ascending=False).head()

Unnamed: 0,All,Swift,Javascript,ObjectiveC,Python,C,Java,C++,Shell,PHP,C#
io,2368,623,555,535,106,37,68,16,363,28,37
app,1842,407,833,328,49,22,54,4,83,26,36
1,1489,288,394,171,167,143,84,51,144,18,29
2,1325,194,467,135,86,150,87,34,137,17,18
img,1163,367,92,82,33,19,38,22,430,16,64


In [14]:
word_counts.sort_values(by= 'Swift', ascending=False).head()

Unnamed: 0,All,Swift,Javascript,ObjectiveC,Python,C,Java,C++,Shell,PHP,C#
io,2368,623,555,535,106,37,68,16,363,28,37
swift,729,589,4,34,0,0,39,2,61,0,0
swiftui,485,470,1,0,0,0,0,0,14,0,0
app,1842,407,833,328,49,22,54,4,83,26,36
img,1163,367,92,82,33,19,38,22,430,16,64


In [15]:
word_counts.sort_values(by= 'Javascript', ascending=False).head()

Unnamed: 0,All,Swift,Javascript,ObjectiveC,Python,C,Java,C++,Shell,PHP,C#
freeware,869,0,868,0,0,0,0,0,1,0,0
app,1842,407,833,328,49,22,54,4,83,26,36
icon,982,19,831,17,16,6,3,0,85,0,5
yes,834,4,791,34,2,1,0,0,0,1,1
native,931,37,716,79,1,18,42,5,22,2,9


In [None]:
word_counts.sort_values(by= 'ObjectiveC', ascending=False).head(25)

In [None]:
word_counts.sort_values(by= 'Python', ascending=False).head()

In [None]:
word_counts.sort_values(by= 'C', ascending=False).head()

In [None]:
word_counts.sort_values(by= 'Java', ascending=False).head()

In [None]:
word_counts.sort_values(by= 'C++', ascending=False).head()

In [None]:
word_counts.sort_values(by= 'Shell', ascending=False).head()

In [None]:
word_counts.sort_values(by= 'PHP', ascending=False).head()

In [None]:
word_counts.sort_values(by= 'C#', ascending=False).head()

# Bigrams 

In [None]:
df['bigram'] = [list(nltk.bigrams(lemmatized.split())) for lemmatized in df.lemmatized]
df.head()

In [None]:
df.head()

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
bag_of_words = cv.fit_transform(df.lemmatized)
bag_of_words

<368x22870 sparse matrix of type '<class 'numpy.int64'>'
	with 80387 stored elements in Compressed Sparse Row format>

In [18]:
#to see what is inside of the sparse matrix
bag_of_words.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [19]:
#pprint(news_df.lemmatized)
pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names()).head()




Unnamed: 0,00,000,0000,000000,00000002,000035590,0001,00010000,00040000,0008,...,zxystd,zy,zybuluo,zynaptiqs,zypper,zythum,zz20rxc,zzanehip,zzpiglet,zzzzbh
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(df.lemmatized)

#pprint(news_df.lemmatized)
pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names()).head()



Unnamed: 0,00,000,0000,000000,00000002,000035590,0001,00010000,00040000,0008,...,zxystd,zy,zybuluo,zynaptiqs,zypper,zythum,zz20rxc,zzanehip,zzpiglet,zzzzbh
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
cv = CountVectorizer(ngram_range=(2, 2))
bag_of_words = cv.fit_transform(df.lemmatized)

In [22]:
pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names()).head()



Unnamed: 0,00 00,00 0008,00 00z,00 01,00 02,00 03,00 05,00 09,00 10,00 12,...,zy br,zybuluo cmd,zynaptiqs ztx,zypper install,zythum sinaapp,zz20rxc png,zzanehip updated,zzpiglet yichahucha,zzpiglet zzpiglet,zzzzbh plotting
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.lemmatized)
y = df.language

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=12)

In [25]:
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_train, y_train)

tree.score(X_train, y_train)

0.5204081632653061

In [26]:
# accuracy is the % of times our model predicted correctly
(tree.predict(X_train) == y_train).mean()

0.5204081632653061

In [27]:
tree.score(X_test, y_test)

0.5135135135135135

In [28]:
df.head()

Unnamed: 0,repo,language,original,lemmatized
0,acidanthera/AppleALC,C++,AppleALC\n========\n\n[![Build Status](https:/...,applealc acidanthera applealc workflow ci badg...
1,gongjianhui/AppleDNS,Python,# Final AppleDNS Pro\n\nAppleDNS 通过收集 Apple 在中...,final appledns pro appledns cdn itunes icloud ...
2,tomaz/appledoc,Objective-C,About appledoc\n==============\n\n**IMPORTANT ...,appledoc important notice collaborator needed ...
3,robovm/apple-ios-samples,Objective-C,# Mirror of Apple's iOS samples\n\nThis reposi...,mirror io sample repository mirror io sample l...
4,appleseedhq/appleseed,C++,# appleseed [![Build Status](https://travis-ci...,appleseed travis ci appleseedhq appleseed svg ...


In [29]:
df.language.value_counts()

Swift          76
JavaScript     59
Objective-C    56
Python         45
C              34
Java           30
C++            20
Shell          19
PHP            18
C#             11
Name: language, dtype: int64

In [32]:
rows = ['Python', 'C#', 'PHP', 'Shell', 'C\+\+']
for row in rows:
    df = df[df["language"].str.contains(row) == False]

In [33]:
df.head()

Unnamed: 0,repo,language,original,lemmatized
2,tomaz/appledoc,Objective-C,About appledoc\n==============\n\n**IMPORTANT ...,appledoc important notice collaborator needed ...
3,robovm/apple-ios-samples,Objective-C,# Mirror of Apple's iOS samples\n\nThis reposi...,mirror io sample repository mirror io sample l...
5,JohnCoates/Aerial,Swift,"<p align=""center"">\n <img src=""https://cloud....",align center img src cloud asset 499192 107541...
10,everettjf/AppleTrace,C,# AppleTrace\n\n`AppleTrace` is developed for ...,appletrace appletrace developed analyzing app ...
11,apple/cups,C,README - Apple CUPS v2.3.6 - 2022-05-25\n=====...,readme cup v2 3 6 2022 05 25 note cup cup ship...


In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [42]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.lemmatized)
y = df.language

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

lm = LogisticRegression().fit(X_train, y_train)

train = pd.DataFrame('actual' == y_train)
test = pd.DataFrame('actual' == y_test)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

In [51]:
print('Accuracy: {:.2%}'.format(accuracy_score(y_train, lm.predict(X_train))))

Accuracy: 95.59%


In [52]:
print('Accuracy: {:.2%}'.format(accuracy_score(y_test, lm.predict(X_test))))

Accuracy: 54.90%
