### Prepare

In [1]:
import pandas as pd
import json
from pandas import json_normalize

import unicodedata
import re

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

### Read .json file to DataFrame

In [2]:
df1 = pd.read_json(r'data.json')
df1

Unnamed: 0,repo,language,readme_contents
0,acidanthera/AppleALC,C++,AppleALC\n========\n\n[![Build Status](https:/...
1,gongjianhui/AppleDNS,Python,# Final AppleDNS Pro\n\nAppleDNS 通过收集 Apple 在中...
2,tomaz/appledoc,Objective-C,About appledoc\n==============\n\n**IMPORTANT ...
3,robovm/apple-ios-samples,Objective-C,# Mirror of Apple's iOS samples\n\nThis reposi...
4,appleseedhq/appleseed,C++,# appleseed [![Build Status](https://travis-ci...
...,...,...,...
485,lvsti/CoreMediaIO-DAL-Example,C++,# CoreMediaIO-DAL-Example\n\nModernized and ex...
486,xamarin/apple-api-docs,F#,# Xamarin Apple API docs\n\nThis repository co...
487,realtnt/AppleID-Verification-AppleScirpt,AppleScript,Apple ID Verification\n---------------------\n...
488,learn-co-students/apples-and-holidays-online-w...,Ruby,# Iterating Over Hashes\n\n## Objectives\n\n1....


#### Identify Dtypes

In [3]:
df1.dtypes

repo               object
language           object
readme_contents    object
dtype: object

In [4]:
string = df1.readme_contents

In [5]:
string[0]



In [6]:
# creating the function
def basic_clean(string):
    
    # lowercase everything
    string = string.lower()
    
    # remove inconsistenceis
    # encode into ascii byte strings
    # decode back into UTF-8
    # (This process will normalize the unicode characters)
    
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('UTF-8')
    
    # replace anything that is not a letter, number, whitespace, etc
    # use regex to perform this operation
    string = re.sub(r"[^a-z0-9\s]", ' ', string)
    
    return string

In [7]:
cleaned = basic_clean(string[0])
cleaned

'applealc\n        \n\n   build status  https   github com acidanthera applealc workflows ci badge svg branch master   https   github com acidanthera applealc actions     scan status  https   scan coverity com projects 16166 badge svg flat 1   https   scan coverity com projects 16166 \n\nan open source kernel extension enabling native macos hd audio for not officially supported codecs without any filesystem modifications  applealcu can be used for systems with digital only audio \n\nenglish  current   \n   https   github com acidanthera applealc blob master readme cn md   \n\n     features\n  digital and analog audio support starting from the os installation\n  recovery hd macos installer audio support\n  automated codec detection\n  unsupported audio controller enabling  internal and external \n  arbitrary kext patching\n  custom platform layout injection\n  works with sip   el capitan \n  currently compatible with 10 4 13\n\n     credits\n   apple  https   www apple com  for macos  \

In [8]:
def tokenize(string):
    """
    This function will take in a string, tokenize the string and 
    return the tokenize string
    """
    
    #create the token
    token = nltk.tokenize.ToktokTokenizer()
    
    #Use the token
    string = token.tokenize(string,  return_str=True)
    
    return string


#### tokenize strings

In [9]:
token = tokenize(cleaned)
token

'applealc\n \n\n build status https github com acidanthera applealc workflows ci badge svg branch master https github com acidanthera applealc actions scan status https scan coverity com projects 16166 badge svg flat 1 https scan coverity com projects 16166 \n\nan open source kernel extension enabling native macos hd audio for not officially supported codecs without any filesystem modifications applealcu can be used for systems with digital only audio \n\nenglish current \n https github com acidanthera applealc blob master readme cn md \n\n features\n digital and analog audio support starting from the os installation\n recovery hd macos installer audio support\n automated codec detection\n unsupported audio controller enabling internal and external \n arbitrary kext patching\n custom platform layout injection\n works with sip el capitan \n currently compatible with 10 4 13\n\n credits\n apple https www apple com for macos \n onyx the black cat https github com gdbinit onyx the black ca

In [10]:
# download wornet lemmatized
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/deangelobowen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
def lemmatize(string):
    """This function takes in a string and returns a lmeeatized 
    version of the string"""
    
    # create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    string_lemmatize = ' '.join(lemmas)
    
    return string_lemmatize

#### lemmatize strings

In [12]:
lemmatized = lemmatize(token)
lemmatized

'applealc build status http github com acidanthera applealc workflow ci badge svg branch master http github com acidanthera applealc action scan status http scan coverity com project 16166 badge svg flat 1 http scan coverity com project 16166 an open source kernel extension enabling native macos hd audio for not officially supported codecs without any filesystem modification applealcu can be used for system with digital only audio english current http github com acidanthera applealc blob master readme cn md feature digital and analog audio support starting from the o installation recovery hd macos installer audio support automated codec detection unsupported audio controller enabling internal and external arbitrary kext patching custom platform layout injection work with sip el capitan currently compatible with 10 4 13 credit apple http www apple com for macos onyx the black cat http github com gdbinit onyx the black cat by fg http reverse put a for the base of the kernel patcher capst

In [13]:
def advanced_clean(df):
    string = df.readme_contents
    
    lis = []
    i = 0
    while i <= 489:
        cleaned = basic_clean(string[i])
        token = tokenize(cleaned)
        lemmatized = lemmatize(token)
        lis.append(lemmatized)
        i+=1
    return lis        

In [14]:
cleaned_df = advanced_clean(df1)

In [15]:
df2 = pd.DataFrame(cleaned_df)

In [16]:
df2

Unnamed: 0,0
0,applealc build status http github com acidanth...
1,final appledns pro appledns apple cdn itunes i...
2,about appledoc important notice collaborator n...
3,mirror of apple s io sample this repository mi...
4,appleseed build status http travis ci org appl...
...,...
485,coremediaio dal example modernized and extende...
486,xamarin apple api doc this repository contains...
487,apple id verification purpose feature after cr...
488,iterating over hash objective 1 iterate over n...


#### join dataframes

In [17]:
df = pd.concat([df1, df2], axis=1, join="inner")

In [18]:
df

Unnamed: 0,repo,language,readme_contents,0
0,acidanthera/AppleALC,C++,AppleALC\n========\n\n[![Build Status](https:/...,applealc build status http github com acidanth...
1,gongjianhui/AppleDNS,Python,# Final AppleDNS Pro\n\nAppleDNS 通过收集 Apple 在中...,final appledns pro appledns apple cdn itunes i...
2,tomaz/appledoc,Objective-C,About appledoc\n==============\n\n**IMPORTANT ...,about appledoc important notice collaborator n...
3,robovm/apple-ios-samples,Objective-C,# Mirror of Apple's iOS samples\n\nThis reposi...,mirror of apple s io sample this repository mi...
4,appleseedhq/appleseed,C++,# appleseed [![Build Status](https://travis-ci...,appleseed build status http travis ci org appl...
...,...,...,...,...
485,lvsti/CoreMediaIO-DAL-Example,C++,# CoreMediaIO-DAL-Example\n\nModernized and ex...,coremediaio dal example modernized and extende...
486,xamarin/apple-api-docs,F#,# Xamarin Apple API docs\n\nThis repository co...,xamarin apple api doc this repository contains...
487,realtnt/AppleID-Verification-AppleScirpt,AppleScript,Apple ID Verification\n---------------------\n...,apple id verification purpose feature after cr...
488,learn-co-students/apples-and-holidays-online-w...,Ruby,# Iterating Over Hashes\n\n## Objectives\n\n1....,iterating over hash objective 1 iterate over n...


### create function to recreate dataframe

In [112]:
def create_df(df):
    cleaned_df = advanced_clean(df)
    df1 = pd.DataFrame(cleaned_df)
    df2 = pd.concat([df, df1], axis=1, join="inner")
    
    #drop nulls
    df2 = df2.dropna()
    
    #rename columns
    df2 = df2.rename(columns={'readme_contents':'original', 0:'lemmatized'})
    
    #identify low sample size languages
    rows = ['AppleScript', 'TypeScript', 'Go','HTML', 'QML' , 'CSS', 'Dart', 'Vue', 'Starlark', 'Assembly', 'Kotlin',
        'Makefile', 'Perl','Zig', 'Eagle' , 'Dockerfile', 'CMake', 'Julia', 'ASL', 'CoffeeScript', 'Erlang',
    'Rich Text Format', 'ActionScript', 'VHDL' , 'Verilog', 'Objective-C\+\+', 'Matlab', 'R', 'ASP.NET', 'F#']
    
    # drop low sample sized languages
    for row in rows:
        df2 = df2[df2["language"].str.contains(row) == False]
    
    return df2

In [113]:
df = create_df(df1)

In [114]:
df

Unnamed: 0,repo,language,original,lemmatized
0,acidanthera/AppleALC,C++,AppleALC\n========\n\n[![Build Status](https:/...,applealc build status http github com acidanth...
1,gongjianhui/AppleDNS,Python,# Final AppleDNS Pro\n\nAppleDNS 通过收集 Apple 在中...,final appledns pro appledns apple cdn itunes i...
2,tomaz/appledoc,Objective-C,About appledoc\n==============\n\n**IMPORTANT ...,about appledoc important notice collaborator n...
3,robovm/apple-ios-samples,Objective-C,# Mirror of Apple's iOS samples\n\nThis reposi...,mirror of apple s io sample this repository mi...
4,appleseedhq/appleseed,C++,# appleseed [![Build Status](https://travis-ci...,appleseed build status http travis ci org appl...
...,...,...,...,...
481,PatMurrayDEV/apple-music-history,JavaScript,[![Netlify Status](https://api.netlify.com/api...,netlify status http api netlify com api v1 bad...
482,antongorodezkiy/codeigniter-apns,PHP,"Codeigniter-apns\n(c) 2012, Anton Gorodezkiy\n...",codeigniter apns c 2012 anton gorodezkiy codei...
484,lprhodes/homebridge-apple-tv,JavaScript,# Homebridge Apple TV\n\n## Introduction\nWelc...,homebridge apple tv introduction welcome to th...
485,lvsti/CoreMediaIO-DAL-Example,C++,# CoreMediaIO-DAL-Example\n\nModernized and ex...,coremediaio dal example modernized and extende...


In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 368 entries, 0 to 489
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   repo        368 non-null    object
 1   language    368 non-null    object
 2   original    368 non-null    object
 3   lemmatized  368 non-null    object
dtypes: object(4)
memory usage: 14.4+ KB


### To prepare this dataframe:

#### Started with 490 values
- dropped null-identified languages
- Removed small sample sized languages identified such as, AppleScript, TypeScript, GO, HTML, CSS, etc.
- lemmatized the readme samples

##### insert into prepare.py file for recreation