In [165]:
from __future__ import print_function, division, absolute_import 

# Remove warnings
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np

### Importing the Plantix Data

In [166]:
# Absolute, machine-specific location of the scraped Plantix spreadsheet.
plantix_xlsx_path = r"C:\Users\Manan Arora\Desktop\Plaksha TLF\Term 2\DataX Project\Agriculture\Datatsets\Plantix_Scrapped_data.xlsx"
raw_data = pd.read_excel(plantix_xlsx_path)

In [167]:
raw_data.head()

Unnamed: 0,Name,Link,Symptoms,Trigger,Biological_Control,Chemical_Control,Preventive_Measure
0,Tomato Yellow Leaf Curl Virus,https://plantix.net/plant-disease/en/200036/to...,"When it infects plants at the seedling stage,...",Tomato Yellow Leaf Curl Virus is not seed-born...,"Sorry, we don't know any alternative treatmen...",Always consider an integrated approach with p...,Use resistant or tolerant varieties. Plant e...
1,Bean Fly,https://plantix.net/plant-disease/en/600264/be...,The young leaves show a large number of punct...,Symptoms are caused by the larvae and adults ...,There are several natural enemies of the bean...,Always consider an integrated approach with p...,"If available, plant resistant varieties. Pla..."
2,Potato Scab,https://plantix.net/plant-disease/en/300009/po...,No symptoms are visible on the aerial parts o...,Symptoms are caused by the bacterium Streptom...,"Treatment of potato plants with compost, comp...",Always consider an integrated approach with p...,Plant tolerant varieties. Ensure well-coordi...
3,Helicoverpa Caterpillar,https://plantix.net/plant-disease/en/600289/he...,Whitish to brown eggs can be found in cluster...,The damage is caused by the caterpillar of He...,Trichogramma wasps (T. chilonis or T. brasili...,Always consider an integrated approach with p...,Use resistant or tolerant varieties if availa...
4,Early Blight,https://plantix.net/plant-disease/en/100321/ea...,Symptoms of early blight occur on older folia...,"Symptoms are caused by Alternaria solani, a f...","Small farmers may use algal limestone, a mixt...",Always consider an integrated approach with p...,Use certified pathogen-free seeds or transpla...


In [168]:
len(raw_data)

555

In [169]:
# Per-row character counts of the two free-text columns.
lengths1 = raw_data['Symptoms'].map(len)
lengths2 = raw_data['Trigger'].map(len)

# Report the mean length of each column.
avg_symptom_len = np.mean(lengths1)
avg_trigger_len = np.mean(lengths2)
print('Average character length of the Symptoms are: ', avg_symptom_len)
print('Average character length of the Triggers are: ', avg_trigger_len)

Average character length of the Symptoms are:  668.845045045045
Average character length of the Triggers are:  710.0450450450451


### NLTK Based Data Processing

In [170]:
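# One-time setup: uncomment to download the tagger model used by pos_tag().
# `nltk` is only imported in the next cell, so run that import first.
#nltk.download('averaged_perceptron_tagger')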

In [171]:
import bs4 as bs
import nltk
from nltk.tokenize import sent_tokenize # tokenizes sentences
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# English stopword list from NLTK. Within the cells visible here this global is
# never read: cleaner() assigns its own local variable of the same name.
eng_stopwords = stopwords.words('english')

#### Splitting Symptoms - Vertical Expansion

In [172]:
# Vertical expansion: one (disease name, sentence) pair per sentence of each
# disease's Symptoms text. `labels` and `symptoms` are parallel lists.
labels = []
symptoms = []
# Columns are read by name; positional integer lookup on a label-indexed row
# (row[1][2]) is deprecated in recent pandas.
for disease_name, symptom_text in zip(raw_data['Name'], raw_data['Symptoms']):
    # A space is inserted after every period before tokenizing -- presumably
    # so sentences scraped without a space after the full stop are still split.
    symptom_all = symptom_text.replace('.', '. ')
    sentences = sent_tokenize(symptom_all)
    labels.extend([disease_name] * len(sentences))
    symptoms.extend(sentences)

In [173]:
# Sentence-level frame. The scraped name is split on its first space:
# first word -> 'Crop', remainder -> 'Disease'.
labels_symptoms = pd.DataFrame({'Disease': labels, 'Symptoms': symptoms})
symptom_name_parts = labels_symptoms['Disease'].str.split(pat=" ", n=1, expand=True)
labels_symptoms[['Crop', 'Disease']] = symptom_name_parts

# Keep only the three columns of interest, in display order.
symptom_columns = ['Crop', 'Disease', 'Symptoms']
labels_symptoms = labels_symptoms[symptom_columns]

In [174]:
labels_symptoms.head()

Unnamed: 0,Crop,Disease,Symptoms
0,Tomato,Yellow Leaf Curl Virus,"When it infects plants at the seedling stage,..."
1,Tomato,Yellow Leaf Curl Virus,"In older plants, the infection results in exce..."
2,Tomato,Yellow Leaf Curl Virus,"At later stages of the disease, they take a le..."
3,Tomato,Yellow Leaf Curl Virus,If the infection takes place before the flower...
4,Bean,Fly,The young leaves show a large number of punct...


#### Splitting Triggers - Vertical Expansion

In [175]:
# Vertical expansion: one (disease name, sentence) pair per sentence of each
# disease's Trigger text. `labels` and `triggers` are parallel lists.
labels = []
triggers = []
# Columns are read by name; positional integer lookup on a label-indexed row
# (row[1][3]) is deprecated in recent pandas.
for disease_name, trigger_text in zip(raw_data['Name'], raw_data['Trigger']):
    # A space is inserted after every period before tokenizing -- presumably
    # so sentences scraped without a space after the full stop are still split.
    triggers_all = trigger_text.replace('.', '. ')
    sentences = sent_tokenize(triggers_all)
    labels.extend([disease_name] * len(sentences))
    triggers.extend(sentences)

In [176]:
# Sentence-level frame. The scraped name is split on its first space:
# first word -> 'Crop', remainder -> 'Disease'.
labels_triggers = pd.DataFrame({'Disease': labels, 'Triggers': triggers})
trigger_name_parts = labels_triggers['Disease'].str.split(pat=" ", n=1, expand=True)
labels_triggers[['Crop', 'Disease']] = trigger_name_parts

# Keep only the three columns of interest, in display order.
trigger_columns = ['Crop', 'Disease', 'Triggers']
labels_triggers = labels_triggers[trigger_columns]

In [177]:
labels_triggers.head()

Unnamed: 0,Crop,Disease,Triggers
0,Tomato,Yellow Leaf Curl Virus,Tomato Yellow Leaf Curl Virus is not seed-born...
1,Tomato,Yellow Leaf Curl Virus,It is spread by whiteflies of the Bemisia taba...
2,Tomato,Yellow Leaf Curl Virus,These whiteflies feed on the lower leaf surfac...
3,Tomato,Yellow Leaf Curl Virus,The whole infection cycle can take place in ab...
4,Bean,Fly,Symptoms are caused by the larvae and adults ...


### Basic Cleaning - Stopwords, Punctuation, etc.

In [178]:
# Patterns are compiled once. Raw strings avoid the invalid "\)" / "\(" escape
# sequences that a plain string literal would contain.
_EMOTICON_RE = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]")
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def cleaner(review):
    '''
    Clean and preprocess a sentence.

    1. Strip HTML markup, keeping only the text
    2. Collect emoticons such as ":)", ";-(", "=D", ":P" before punctuation is removed
    3. Replace every non-letter character with a space
    4. Lower-case and split on whitespace
    5. Remove English stopwords
    6. Rejoin the remaining words, followed by the emoticons, into one string

    Parameters
    ----------
    review : str
        Raw sentence text.

    Returns
    -------
    str
        Space-separated lower-case words with any emoticons appended at the end.
    '''
    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed and avoids bs4's "no parser" warning.
    text = bs.BeautifulSoup(review, "html.parser").text

    # Emoticons must be captured before the non-letter substitution erases them.
    emoticons = _EMOTICON_RE.findall(text)

    words = _NON_LETTER_RE.sub(" ", text).lower().split()

    stop_words = _english_stopwords()
    kept = [w for w in words if w not in stop_words]

    return ' '.join(kept + emoticons)

In [179]:
# Run cleaner() over every symptom sentence and store the result as a new column.
num_symptoms = len(labels_symptoms['Symptoms'])

symptoms_clean_original = []
for count, sentence in enumerate(labels_symptoms['Symptoms'], start=1):
    # Progress message after every 500th sentence is reached.
    if count % 500 == 0:
        print("Done with %d symtoms" % count)
    symptoms_clean_original.append(cleaner(sentence))

labels_symptoms['Cleaned_Symptom'] = symptoms_clean_original

Done with 500 symtoms
Done with 1000 symtoms
Done with 1500 symtoms
Done with 2000 symtoms
Done with 2500 symtoms
Done with 3000 symtoms
Done with 3500 symtoms


In [180]:
# Run cleaner() over every trigger sentence and store the result as a new column.
num_triggers = len(labels_triggers['Triggers'])

triggers_clean_original = []
for count, sentence in enumerate(labels_triggers['Triggers'], start=1):
    # Progress message after every 500th sentence is reached.
    if count % 500 == 0:
        print("Done with %d triggers" % count)
    triggers_clean_original.append(cleaner(sentence))

labels_triggers['Cleaned_Trigger'] = triggers_clean_original

Done with 500 triggers
Done with 1000 triggers
Done with 1500 triggers
Done with 2000 triggers
Done with 2500 triggers
Done with 3000 triggers
Done with 3500 triggers


#### Stemming 

In [181]:
# Porter-stem every word of each cleaned symptom sentence.
ps = PorterStemmer()
symptoms_ps = []

for count, cleaned in enumerate(labels_symptoms['Cleaned_Symptom'], start=1):
    if count % 500 == 0:
        print("Done with %d symptoms" % count)
    # The token 'oed' is skipped rather than stemmed.
    ps_stems = [ps.stem(word) for word in cleaned.split() if word != 'oed']
    symptoms_ps.append(' '.join(ps_stems))

labels_symptoms['Stemmed_Symptom'] = symptoms_ps

Done with 500 symptoms
Done with 1000 symptoms
Done with 1500 symptoms
Done with 2000 symptoms
Done with 2500 symptoms
Done with 3000 symptoms
Done with 3500 symptoms


In [182]:
# Porter-stem every word of each cleaned trigger sentence, reusing the `ps`
# stemmer created for the symptoms.
triggers_ps = []

for count, cleaned in enumerate(labels_triggers['Cleaned_Trigger'], start=1):
    if count % 500 == 0:
        print("Done with %d triggers" % count)
    # The token 'oed' is skipped rather than stemmed.
    ps_stems = [ps.stem(word) for word in cleaned.split() if word != 'oed']
    triggers_ps.append(' '.join(ps_stems))

labels_triggers['Stemmed_Trigger'] = triggers_ps

Done with 500 triggers
Done with 1000 triggers
Done with 1500 triggers
Done with 2000 triggers
Done with 2500 triggers
Done with 3000 triggers
Done with 3500 triggers


#### Lemmatization

In [183]:
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The tag is matched on its leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag yields the literal 'n'.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Looked up only on a match, so unmatched tags never touch `wordnet`.
            return getattr(wordnet, attr_name)
    return 'n'

In [184]:
# Lemmatize each cleaned symptom sentence, passing every word's POS tag
# (converted by get_wordnet_pos) to the WordNet lemmatizer.
symptoms_wnl = []
wnl = WordNetLemmatizer()

for count, cleaned in enumerate(labels_symptoms['Cleaned_Symptom'], start=1):
    if count % 500 == 0:
        print("Done with %d symptoms" % count)
    wnl_stems = [
        wnl.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in pos_tag(cleaned.split())
    ]
    symptoms_wnl.append(' '.join(wnl_stems))

labels_symptoms['Lematized_Symptom'] = symptoms_wnl

Done with 500 symptoms
Done with 1000 symptoms
Done with 1500 symptoms
Done with 2000 symptoms
Done with 2500 symptoms
Done with 3000 symptoms
Done with 3500 symptoms


In [185]:
# Lemmatize each cleaned trigger sentence, reusing the `wnl` lemmatizer
# created for the symptoms.
triggers_wnl = []

for count, cleaned in enumerate(labels_triggers['Cleaned_Trigger'], start=1):
    if count % 500 == 0:
        print("Done with %d triggers" % count)
    wnl_stems = [
        wnl.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in pos_tag(cleaned.split())
    ]
    triggers_wnl.append(' '.join(wnl_stems))

labels_triggers['Lematized_Trigger'] = triggers_wnl

Done with 500 triggers
Done with 1000 triggers
Done with 1500 triggers
Done with 2000 triggers
Done with 2500 triggers
Done with 3000 triggers
Done with 3500 triggers


### Data Check

In [186]:
labels_symptoms.head()

Unnamed: 0,Crop,Disease,Symptoms,Cleaned_Symptom,Stemmed_Symptom,Lematized_Symptom
0,Tomato,Yellow Leaf Curl Virus,"When it infects plants at the seedling stage,...",infects plants seedling stage tomato yellow le...,infect plant seedl stage tomato yellow leaf cu...,infects plant seedling stage tomato yellow lea...
1,Tomato,Yellow Leaf Curl Virus,"In older plants, the infection results in exce...",older plants infection results excessive branc...,older plant infect result excess branch thicke...,old plant infection result excessive branch th...
2,Tomato,Yellow Leaf Curl Virus,"At later stages of the disease, they take a le...",later stages disease take leathery texture chl...,later stage diseas take leatheri textur chloro...,later stag disease take leathery texture chlor...
3,Tomato,Yellow Leaf Curl Virus,If the infection takes place before the flower...,infection takes place flowering stage number f...,infect take place flower stage number fruit co...,infection take place flower stage number fruit...
4,Bean,Fly,The young leaves show a large number of punct...,young leaves show large number punctures light...,young leav show larg number punctur light yell...,young leave show large number puncture light y...


In [187]:
labels_triggers.head()

Unnamed: 0,Crop,Disease,Triggers,Cleaned_Trigger,Stemmed_Trigger,Lematized_Trigger
0,Tomato,Yellow Leaf Curl Virus,Tomato Yellow Leaf Curl Virus is not seed-born...,tomato yellow leaf curl virus seed borne trans...,tomato yellow leaf curl viru seed born transmi...,tomato yellow leaf curl virus seed borne trans...
1,Tomato,Yellow Leaf Curl Virus,It is spread by whiteflies of the Bemisia taba...,spread whiteflies bemisia tabaci species,spread whitefli bemisia tabaci speci,spread whitefly bemisia tabaci specie
2,Tomato,Yellow Leaf Curl Virus,These whiteflies feed on the lower leaf surfac...,whiteflies feed lower leaf surface number plan...,whitefli feed lower leaf surfac number plant a...,whitefly fee low leaf surface number plant att...
3,Tomato,Yellow Leaf Curl Virus,The whole infection cycle can take place in ab...,whole infection cycle take place hours favored...,whole infect cycl take place hour favor dri we...,whole infection cycle take place hour favor dr...
4,Bean,Fly,Symptoms are caused by the larvae and adults ...,symptoms caused larvae adults bean fly ophiomy...,symptom caus larva adult bean fli ophiomyia ph...,symptom cause larvae adult bean fly ophiomyia ...


### Gensim

In [189]:
import os
import gensim
# Paths to the two Lee corpus files under gensim's own test/test_data directory.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file, lee_test_file = (
    os.path.join(test_data_dir, corpus_name)
    for corpus_name in ('lee_background.cor', 'lee.cor')
)

In [190]:
lee_train_file

'C:\\Users\\Manan Arora\\Anaconda3\\lib\\site-packages\\gensim\\test\\test_data\\lee_background.cor'

In [193]:
import smart_open

def read_corpus(fname, tokens_only=False):
    """Yield one preprocessed document per line of ``fname``.

    Each line is tokenized with ``gensim.utils.simple_preprocess``. With
    ``tokens_only=True`` the bare token list is yielded; otherwise the tokens
    are wrapped in a ``TaggedDocument`` tagged with the zero-based line number.
    The file is read as ISO-8859-1.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as handle:
        for line_no, line in enumerate(handle):
            words = gensim.utils.simple_preprocess(line)
            if not tokens_only:
                # Training data carries a tag so the document can be looked up later.
                yield gensim.models.doc2vec.TaggedDocument(words, [line_no])
            else:
                yield words

# Materialise the Lee corpora: tagged documents for training, bare token lists for testing.
# Neither variable is referenced by the later cells visible here; the model
# below is trained on `train_list` instead.
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [221]:
# Preview the first cleaned symptom sentence. The column is read directly
# because `symptoms_list` is only assigned in a later cell, so referencing it
# here fails on a fresh kernel run from top to bottom.
labels_symptoms['Cleaned_Symptom'][0]

'infects plants seedling stage tomato yellow leaf curl virus causes severe stunting young leaves shoots resulting somewhat bushy growth plant'

In [226]:
# One TaggedDocument per cleaned symptom sentence, tagged with its row position.
symptoms_list = labels_symptoms['Cleaned_Symptom']
train_list = [
    gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(sentence), [tag])
    for tag, sentence in enumerate(symptoms_list)
]

In [227]:
train_list[:5]

[TaggedDocument(words=['infects', 'plants', 'seedling', 'stage', 'tomato', 'yellow', 'leaf', 'curl', 'virus', 'causes', 'severe', 'stunting', 'young', 'leaves', 'shoots', 'resulting', 'somewhat', 'bushy', 'growth', 'plant'], tags=[0]),
 TaggedDocument(words=['older', 'plants', 'infection', 'results', 'excessive', 'branching', 'thicker', 'wrinkled', 'leaves', 'interveinal', 'chlorosis', 'clearly', 'visible', 'blade'], tags=[1]),
 TaggedDocument(words=['later', 'stages', 'disease', 'take', 'leathery', 'texture', 'chlorotic', 'margins', 'rolled', 'upwards', 'inwards'], tags=[2]),
 TaggedDocument(words=['infection', 'takes', 'place', 'flowering', 'stage', 'number', 'fruits', 'considerably', 'reduced', 'even', 'though', 'noticeable', 'symptoms', 'surface'], tags=[3]),
 TaggedDocument(words=['young', 'leaves', 'show', 'large', 'number', 'punctures', 'light', 'yellow', 'spots', 'upper', 'side', 'especially', 'bottom', 'leaf'], tags=[4])]

In [328]:
# Doc2Vec model with 200-dimensional vectors, min_count=1 and 1000 epochs.
# NOTE(review): no seed is passed, so results presumably differ between runs --
# confirm whether reproducibility matters here.
model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=1, epochs=1000)

In [329]:
# Build the model vocabulary from the tagged symptom sentences.
model.build_vocab(train_list)

In [330]:
# Train on the tagged symptom sentences, using the document count and epoch
# setting already stored on the model.
model.train(train_list, total_examples=model.corpus_count, epochs=model.epochs)

In [331]:
# Tokens of a query sentence, split on single spaces (no further preprocessing).
new_sentence = "gummy substance usually exudes cankers".split(" ")  
# Infer a vector for the query and list the 5 training documents (by tag) most similar to it.
# NOTE(review): `docvecs` is reportedly deprecated in favour of `dv` in gensim 4 --
# confirm against the installed gensim version.
model.docvecs.most_similar(positive=[model.infer_vector(new_sentence)],topn=5)

[(919, 0.5887357592582703),
 (1047, 0.5835825204849243),
 (2690, 0.582670271396637),
 (3713, 0.5786845684051514),
 (244, 0.5720994472503662)]

In [325]:
# Show the cleaned sentence behind tag 2436.
# NOTE(review): 2436 is not among the tags in the most_similar output shown
# above (919, 1047, 2690, 3713, 244) -- this lookup looks like it came from an earlier run.
symptoms_list[2436]

'winter cankers similar usually softer moister sunken sour smell'

In [None]:
# Mean squared difference between X.dot(theta) and x_prime.
# NOTE(review): X, theta and x_prime are not defined in any cell visible here,
# and this cell has no execution count -- it looks like a leftover; confirm and remove.
np.mean((X.dot(theta) - x_prime)**2)