In [1]:
# import important modules
import numpy as np
import pandas as pd
# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB # classifier 
from sklearn.metrics import (
    accuracy_score,
    classification_report, 
)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# text preprocessing modules
from string import punctuation 
# text preprocessing modules
import nltk
from nltk.tokenize import word_tokenize
# adding two more downloads based on warnings generated when cleaning text 
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression
# Download dependency
for dependency in (
    "brown",
    "names",
    "wordnet",
    "averaged_perceptron_tagger",
    "universal_tagset",
):
    nltk.download(dependency)
    
import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

# import tensorflow and keras for padding 
import tensorflow as tf 

import tensorflow.python.keras as k  # this worked for fixing problems with keras/tensorflow
# import keras as k

from keras_preprocessing.sequence import pad_sequences

from keras_preprocessing.text import Tokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emilyjiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/emilyjiang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /Users/emilyjiang/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     /Users/emilyjiang/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emilyjiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emilyjiang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/emilyjiang/nlt

In [2]:
# load data from phyb_400 csv 
df_original = pd.read_csv("/Users/emilyjiang/Desktop/Git/Phoenix Bioinformatics/IF Code/FinalDatasets/cat1.csv", index_col=0) 

# print beginning of data to check 
df_original.head()

Unnamed: 0,journal_title,pubmed,abstract,Label
0,Cancer science,33211385;,Copy number alterations detected by comparativ...,1.0
1,Revista iberoamericana de micologia,20974273;,Aspergillus fumigatus is an opportunistic path...,1.0
2,Journal of experimental botany,17728292;,Catalase and hydrogen peroxide (H(2)O(2)) have...,1.0
3,"Methods in molecular biology (Clifton, N.J.)",35781205;,The availability of exceptionally strong and t...,1.0
4,PloS one,35639710;,Positive and counter-selectable markers have b...,1.0


In [3]:
# checking shape of dataset 
df_original.shape
# dataset has 8 columns, 400 rows 

(1000, 4)

In [4]:
# check if  dataset has any missing values
df_original.isnull().sum()
df_original = df_original.dropna()
df_original

Unnamed: 0,journal_title,pubmed,abstract,Label
0,Cancer science,33211385;,Copy number alterations detected by comparativ...,1.0
1,Revista iberoamericana de micologia,20974273;,Aspergillus fumigatus is an opportunistic path...,1.0
2,Journal of experimental botany,17728292;,Catalase and hydrogen peroxide (H(2)O(2)) have...,1.0
3,"Methods in molecular biology (Clifton, N.J.)",35781205;,The availability of exceptionally strong and t...,1.0
4,PloS one,35639710;,Positive and counter-selectable markers have b...,1.0
...,...,...,...,...
995,Journal of dental research,17652207;,Amelogenesis imperfecta is an inherited disord...,0.0
996,Autophagy,29261001;,"In macroautophagy/autophagy, cargoes are colle...",0.0
997,Biochimica et biophysica acta. Molecular and c...,30597201;,Protein modification by arginylation regulates...,0.0
998,Proceedings of the National Academy of Science...,32366662;,The Arg/N-degron pathway targets proteins for ...,0.0


In [5]:
# converts to string which is usable
df_original['abstract'] = df_original['abstract'].astype(str)
print(df_original['abstract'])

0      Copy number alterations detected by comparativ...
1      Aspergillus fumigatus is an opportunistic path...
2      Catalase and hydrogen peroxide (H(2)O(2)) have...
3      The availability of exceptionally strong and t...
4      Positive and counter-selectable markers have b...
                             ...                        
995    Amelogenesis imperfecta is an inherited disord...
996    In macroautophagy/autophagy, cargoes are colle...
997    Protein modification by arginylation regulates...
998    The Arg/N-degron pathway targets proteins for ...
999    Protein aggregates are a common feature of neu...
Name: abstract, Length: 979, dtype: object


In [6]:
stop_words =  stopwords.words('english')
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text =  re.sub(r'http\S+',' link ', text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer() 
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    
    # Return a list of words
    return(text)

In [7]:
# df_original["cleaned_data"] = df_original[df_original.columns[len(df_original.columns)-1]]
# df_original.head()
df_original["cleaned_data"] = df_original['abstract'].apply(text_cleaning)
df_original['str_len'] = df_original['abstract'].apply(len)
df_original

Unnamed: 0,journal_title,pubmed,abstract,Label,cleaned_data,str_len
0,Cancer science,33211385;,Copy number alterations detected by comparativ...,1.0,Copy number alteration detected comparative ge...,1711
1,Revista iberoamericana de micologia,20974273;,Aspergillus fumigatus is an opportunistic path...,1.0,Aspergillus fumigatus opportunistic pathogen c...,3708
2,Journal of experimental botany,17728292;,Catalase and hydrogen peroxide (H(2)O(2)) have...,1.0,Catalase hydrogen peroxide H O extensively stu...,1714
3,"Methods in molecular biology (Clifton, N.J.)",35781205;,The availability of exceptionally strong and t...,1.0,The availability exceptionally strong tightly ...,1327
4,PloS one,35639710;,Positive and counter-selectable markers have b...,1.0,Positive counter selectable marker successfull...,1147
...,...,...,...,...,...,...
995,Journal of dental research,17652207;,Amelogenesis imperfecta is an inherited disord...,0.0,Amelogenesis imperfecta inherited disorder aff...,1061
996,Autophagy,29261001;,"In macroautophagy/autophagy, cargoes are colle...",0.0,In macroautophagy autophagy cargo collected sp...,1419
997,Biochimica et biophysica acta. Molecular and c...,30597201;,Protein modification by arginylation regulates...,0.0,Protein modification arginylation regulates pr...,1713
998,Proceedings of the National Academy of Science...,32366662;,The Arg/N-degron pathway targets proteins for ...,0.0,The Arg N degron pathway target protein degrad...,1921


In [8]:
myTokenizer = Tokenizer(num_words=1000)

In [9]:
# tokenize the sentences (but using keras this time)
df_original.iloc[:,-2]
myTokenizer.fit_on_texts(df_original.iloc[:,-2])
sequences = myTokenizer.texts_to_sequences(df_original.iloc[:,-2])

# padding (make everything the same dimension by adding zeros to the front)
padded = pad_sequences(sequences, maxlen=len(df_original.iloc[:,-2][3].split(" ")))

In [10]:
print(df_original['Label']) 
sum(df_original['Label'])

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
995    0.0
996    0.0
997    0.0
998    0.0
999    0.0
Name: Label, Length: 979, dtype: float64


100.0

In [11]:
# x = padded
x = df_original['cleaned_data']
y = df_original.Label.values 
len(x)

979

In [None]:
# def balanced_subsample(x,y,subsample_size=1.0):

#     class_xs = []
#     min_elems = None

#     for yi in np.unique(y):
#         elems = x[(y == yi)]
#         class_xs.append((yi, elems))
#         if min_elems == None or elems.shape[0] < min_elems:
#             min_elems = elems.shape[0]

#     use_elems = min_elems
#     if subsample_size < 1:
#         use_elems = int(min_elems*subsample_size)

#     xs = []
#     ys = []

#     for ci, this_xs in class_xs:
#         if len(this_xs) > use_elems:
#             np.random.shuffle(this_xs)

#         x_ = this_xs[:use_elems]
#         y_ = np.empty(use_elems)
#         y_.fill(ci)

#         xs.append(x_)
#         ys.append(y_)

#     xs = np.concatenate(xs)
#     ys = np.concatenate(ys)

#     return xs,ys

# x, y = balanced_subsample(x,y)

In [12]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, 
                    test_size=0.15,
                    random_state=42, 
                    shuffle=True,
                    stratify=y,
                    )

In [13]:
phyB_classifier = Pipeline(steps=[
    ('pre_processing',TfidfVectorizer(lowercase=False)),
    ('naive_bayes',MultinomialNB())
])

In [14]:
phyB_classifier.fit(x_train, y_train) 

In [15]:
y_predict = phyB_classifier.predict(x_valid) 

In [16]:
accuracy_score(y_valid, y_predict) 

0.8979591836734694

In [17]:
import joblib
# import it to the webscraping folder
joblib.dump(phyB_classifier, '/Users/emilyjiang/Desktop/Webscraping')

['/Users/emilyjiang/Desktop/Webscraping']