In [None]:
!pip install transformers
from transformers import pipeline, set_seed
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip3 install -q -U bitsandbytes
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

!pip3 install -q -U datasets==2.17.0

In [None]:
import os
import transformers
import torch
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

In [None]:
# Shared summarization pipeline; later cells call it to shorten long user questions.
# No checkpoint is named, so the library chooses its default summarization model.
# NOTE(review): consider pinning an explicit model for reproducibility.
summarizer = pipeline("summarization")

In [None]:
import pandas as pd

In [None]:
# Load the question/answer table; later cells read its 'questions' and 'answers' columns.
qa_csv_path = '/kaggle/input/farmers-call-query-data-qa/questionsv4.csv'
data = pd.read_csv(qa_csv_path, sep=',')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.dtypes

## Data cleaning

In [None]:
#change character lower case
data['questions']=data['questions'].str.lower()

In [None]:
# Discard every row that contains a missing value in any column.
data = data.dropna()

In [None]:
# Remove exact duplicate rows, then renumber the index 0..n-1.
# dropna()/drop_duplicates() leave gaps in the index; the retrieval cells
# compute a row *position* with np.argmax and then look answers up by it,
# so labels and positions must coincide.
data = data.drop_duplicates().reset_index(drop=True)

### Tokenization

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
import string
string.punctuation

In [None]:
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # Mapping a code point to None makes str.translate delete that character.
    strip_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(strip_table)

In [None]:
data['questions']=data.questions.apply(remove_punct)

In [None]:
data.head()

### Remove Stop Words

In [None]:
from nltk.corpus import stopwords

# English stop-word list held as a set for the membership test in remove_stopword().
# NOTE(review): no nltk.download('stopwords') call is visible; presumably the
# corpus is already present in this environment — confirm.
stop=set(stopwords.words('english'))

In [None]:
def remove_stopword(text):
    """Tokenize ``text`` and re-join the tokens that are not in ``stop``.

    The stop-word test is case-insensitive; kept tokens retain their
    original casing and are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() not in stop:
            kept.append(token)
    return ' '.join(kept)

In [None]:
data['questions']=data.questions.apply(remove_stopword)

In [None]:
data.questions

### Lemmatizing

In [None]:
# nltk.download()

In [None]:
import nltk
import subprocess

# Directory that receives the downloaded corpus and is added to NLTK's search path.
NLTK_WORK_DIR = '/kaggle/working/'

# Download and unzip wordnet
try:
    # NOTE(review): resource name kept exactly as in the original; NLTK
    # resources are normally addressed as 'corpora/wordnet', so this lookup
    # presumably always misses and forces the download below — confirm that
    # this is the intended workaround before changing it.
    nltk.data.find('wordnet.zip')
except LookupError:
    # Only a missing resource should trigger the download; a bare `except:`
    # would also hide unrelated failures (and KeyboardInterrupt).
    nltk.download('wordnet', download_dir=NLTK_WORK_DIR)
    # -o overwrites files from a previous run instead of prompting on stdin.
    subprocess.run([
        "unzip", "-o",
        "/kaggle/working/corpora/wordnet.zip",
        "-d", "/kaggle/working/corpora",
    ])
    # Avoid stacking duplicate entries when the cell is re-run.
    if NLTK_WORK_DIR not in nltk.data.path:
        nltk.data.path.append(NLTK_WORK_DIR)

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [None]:
lemmatizer.lemmatize("kids")

In [None]:
def lemm(text):
    """Lemmatize each token of ``text`` and return the lemmas joined by spaces."""
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


In [None]:
data.questions=data.questions.apply(lemm)

## Save and load dataframe as a pkl file

In [None]:
# data.to_pickle("data.pkl")

In [None]:
# df = pd.read_pickle("/kaggle/working/data.pkl")

### Vectoriser

In [None]:
corpus=data['questions'].values
corpus

### Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
bw_vect = CountVectorizer()
# tokenize the questions and build the vocabulary
bw_fit=bw_vect.fit(corpus)
# encode every question as a vector of word counts
bw_corpus = bw_fit.transform(corpus)

In [None]:
bw_corpus.shape

In [None]:
bw_fit.get_feature_names_out()

In [None]:
bw_corpus.toarray()

In [None]:
cv_data=pd.DataFrame(bw_corpus.toarray(),columns=bw_fit.get_feature_names_out())
cv_data

### TF-IDF

This is a statistical measure used to assess the importance of a term within a document, relative to a collection or corpus. The weight increases proportionally with the number of occurrences of the word in the document, and is offset by how frequently the word appears across the corpus, so that terms common to most documents receive a lower weight.

- TF : Term Frequency describes how often a certain term appears in a document compared to all other terms in the document.
$$TF(m,p)=\frac{f_{m,p}}{f_p}$$
$f_{m,p}$: frequency of the word $m$ in the sentence $p$; $f_p$: number of words in the sentence $p$.

- IDF : IDF measures the significance of a term not in relation to its frequency in a particular document, but in relation to its distribution and usage across all documents.
$$IDF(m)=\log\left(\frac{L}{L_m}\right)$$
$L$: number of sentences in the corpus; $L_m$: number of sentences in the corpus in which the word $m$ appears.

- TF-IDF : This is the multiplication of the two values. Since Term Frequency represents the relevance of a term in a given document and Inverse Document Frequency can reflect the role of a term in relation to all the documents in a corpus, the combination of the two values helps in understanding the actual frequency of terms and the potential of each term.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer whose vocabulary is capped at 5000 features.
tf_vect = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the question corpus.
tfidf_fit=tf_vect.fit(corpus)

# TF-IDF matrix of the corpus; later cells compare user queries against it.
tfidf_corpus= tfidf_fit.transform(corpus)

## Load and Save the vectorizer as pkl

In [None]:
# Self-contained demo of persisting a vectorizer vocabulary with pickle.
# The imports live in this cell because np, pickle and TfidfTransformer are
# not imported anywhere above it, so the cell failed on a fresh Run All.
import pickle

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Toy corpus under its own name so the real `corpus` of questions built
# earlier in the notebook is not overwritten.
demo_corpus = np.array(["aaa bbb ccc", "aaa bbb ddd"])
vectorizer = CountVectorizer(decode_error="replace")
vec_train = vectorizer.fit_transform(demo_corpus)
#Save vectorizer.vocabulary_
with open("feature.pkl", "wb") as vocab_file:
    pickle.dump(vectorizer.vocabulary_, vocab_file)

#Load it later
# NOTE: only unpickle files you wrote yourself; pickle.load can execute arbitrary code.
transformer = TfidfTransformer()
with open("feature.pkl", "rb") as vocab_file:
    loaded_vec = CountVectorizer(decode_error="replace", vocabulary=pickle.load(vocab_file))
tfidf = transformer.fit_transform(loaded_vec.fit_transform(np.array(["aaa ccc eee"])))

In [None]:
tfidf_fit.get_feature_names_out()

In [None]:
tfidf_data=pd.DataFrame(tfidf_corpus.toarray(),columns=tfidf_fit.get_feature_names_out())
tfidf_data

In [None]:
test=' how to avail kisan credit card loan for sali crop.'
tfidf_test=tfidf_fit.transform([test])

In [None]:
tfidf_test.shape

In [None]:
mask=tfidf_test.toarray()!=0
m=mask[0]
m

In [None]:
tfidf_test.toarray()[mask]

In [None]:
tfidf_data.columns[m]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cm=cosine_similarity(tfidf_test, tfidf_corpus)
cm[0]

In [None]:
import numpy as np
pos=np.argmax(cm[0])
data.iloc[pos]

In [None]:
# pos is a row *position* (np.argmax over the similarity row), so select it
# positionally. data.answers[pos] is a label lookup and, because rows were
# dropped during cleaning, can return a different row or raise KeyError.
data.answers.iloc[pos]

### Function response



In [None]:
import numpy as np
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import random

In [None]:
# Lemmatizer used only by clean_data(). It is deliberately NOT named `lemm`:
# that name already belongs to the text-lemmatizing function defined earlier
# in this notebook, and rebinding it here would break that function.
clean_lemmatizer = WordNetLemmatizer()

# Matches any single parenthesis, square bracket or curly brace.
_BRACKET_CHARS = re.compile(r"[\([{})\]]")

def clean_data(corpus):
    """Strip brackets from each sentence and lemmatize its words.

    Parameters
    ----------
    corpus : iterable of str
        Sentences to clean.

    Returns
    -------
    list of str
        One cleaned sentence per input sentence, in the same order. Words
        are split on whitespace, lemmatized individually and re-joined
        with single spaces.
    """
    result = []
    for sentence in corpus:
        # Replace every bracket character with a space so adjacent words stay separated.
        sentence = _BRACKET_CHARS.sub(" ", sentence)
        # Lemmatize word by word, then rebuild the sentence.
        words = [clean_lemmatizer.lemmatize(word) for word in sentence.split()]
        result.append(" ".join(words))
    return result

### Summarizer

In [None]:
text = """What is the optimal planting time for tomatoes in our region, considering the current weather conditions and soil moisture levels? Additionally, which tomato varieties have shown the best adaptability to our specific microclimate and soil composition in recent years?"""
summary=summarizer(text, max_length=40, min_length=10, do_sample=False)[0]
print("\n",summary['summary_text'])

In [None]:
def summarize_input(test, summarize=None):
    """Return a short summary of ``test``.

    Parameters
    ----------
    test : str
        Text to summarize. The original implementation ignored this
        argument and always summarized the notebook-level ``text`` variable.
    summarize : callable, optional
        Summarization callable with the same call signature as the
        notebook's ``summarizer`` pipeline. Defaults to that pipeline.

    Returns
    -------
    str
        The ``'summary_text'`` field of the first result.
    """
    if summarize is None:
        summarize = summarizer
    summary = summarize(test, max_length=20, min_length=10, do_sample=False)[0]
    return summary['summary_text']

In [None]:
testtt = "What is the optimal planting time for tomatoes in our region, considering the current weather conditions and soil moisture levels? Additionally, which tomato varieties have shown the best adaptability to our specific microclimate and soil composition in recent years?"
if len(nltk.word_tokenize(testtt)) >2:
    print(summarize_input(testtt))

In [None]:
# tf_vect = TfidfVectorizer(max_features=5000)

# tfidf_fit=tf_vect.fit(corpus)

# tfidf_corpus= tfidf_fit.transform(corpus)

welcome = ['hi', 'hey']

def chatbot_response(test):
    """Return the stored answer whose question is most similar to ``test``.

    If any whitespace-separated word of ``test`` is a greeting listed in
    ``welcome`` (case-insensitive), a random greeting is returned instead.
    Inputs longer than 20 tokens are summarized before matching.

    Parameters
    ----------
    test : str
        The user's message.

    Returns
    -------
    The ``answers`` value of the best-matching row of ``data`` (or a greeting).
    """
    # If user inputs any of the greeting words, give greeting in response
    for w in test.split():
        if w.lower() in welcome:
            return random.choice(welcome)
    # Condense long questions before vectorizing them.
    if len(nltk.word_tokenize(test)) > 20:
        test = summarize_input(test)
    tfidf_test = tfidf_fit.transform([test])
    cm = cosine_similarity(tfidf_test, tfidf_corpus)
    # NOTE: when no query term is in the vocabulary all similarities are 0
    # and argmax falls back to the first row.
    pos = np.argmax(cm[0])
    # pos is a row position, so index positionally; a label lookup
    # (data.answers[pos]) can hit the wrong row once rows have been dropped.
    return data.answers.iloc[pos]


### Chatbot

In [None]:
# # Remove comments to run chatbot
# print("FARM BOT: Hi!! Type bye to exit. Ask me anything: ")
# while(True):
#     user_chat = input()
#     if(user_chat.lower()=="bye"):
#         print("Bye")
#         break
#     print("Farm BOT: ", end=" ")
#     print(chatbot_response(user_chat))

### Eng-Hindi chat

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

In [None]:
model = MBartForConditionalGeneration.from_pretrained("SnypzZz/Llama2-13b-Language-translate")


In [None]:
tokenizer = MBart50TokenizerFast.from_pretrained("SnypzZz/Llama2-13b-Language-translate", src_lang="en_XX")



In [None]:
article_en = "FARM BOT: Hi!! Type bye to exit. Ask me anything"

In [None]:
model_inputs = tokenizer(article_en, return_tensors="pt")


In [None]:

# translate from English to Hindi
generated_tokens = model.generate(
    **model_inputs,
    # start decoding with the Hindi language code taken from the tokenizer
    forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
)

# NOTE(review): the sample output previously shown here
# ('संयुक्त राष्ट्र के नेता कहते हैं कि सीरिया में कोई सैन्य समाधान नहीं है')
# is a sentence about Syria, not a translation of the greeting in article_en.
# The actual result is decoded in the next cell.

In [None]:
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [None]:
def toHindi(en):
    """Translate ``en`` to Hindi using the notebook-level ``model`` and ``tokenizer``.

    Returns the first decoded sequence with special tokens removed.
    """
    encoded = tokenizer(en, return_tensors="pt")
    hindi_bos = tokenizer.lang_code_to_id["hi_IN"]
    output_ids = model.generate(**encoded, forced_bos_token_id=hindi_bos)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

In [None]:
welcome = ['hi', 'hey']

def chatbot_response_hindi(test):
    """Answer an English question with the best-matching stored answer, in Hindi.

    If any whitespace-separated word of ``test`` is a greeting listed in
    ``welcome`` (case-insensitive), a random greeting is returned as-is
    (it is not translated). Inputs longer than 20 tokens are summarized
    before matching.

    Parameters
    ----------
    test : str
        The user's message.

    Returns
    -------
    str
        ``toHindi`` applied to the best-matching answer, or a greeting.
    """
    # If user inputs any of the greeting words, give greeting in response
    for w in test.split():
        if w.lower() in welcome:
            return random.choice(welcome)
    # Condense long questions before vectorizing them.
    if len(nltk.word_tokenize(test)) > 20:
        test = summarize_input(test)
    tfidf_test = tfidf_fit.transform([test])
    cm = cosine_similarity(tfidf_test, tfidf_corpus)
    pos = np.argmax(cm[0])
    # pos is a row position, so index positionally; a label lookup
    # (data.answers[pos]) can hit the wrong row once rows have been dropped.
    return toHindi(data.answers.iloc[pos])

In [None]:
# # Remove comments to run chatbot
# print("FARM BOT: हाय!! बाहर निकलने के लिए बाई टाइप करें. मुझे कुछ भी पूछें: ")
# while(True):
#     user_chat = input()
#     if(user_chat.lower()=="bye"):
#         print("Bye")
#         break
#     print("Farm BOT: ", end=" ")
#     print(chatbot_response_hindi(user_chat))

### Hin-Hin Chat

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# **Quantized model (my modification)**

In [None]:
# 4-bit NF4 quantization settings, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Fixed: the original called `AutoTokenizer.from_pretrained.from_pretrained(...)`,
# which raises AttributeError.
tokenizer2 = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# mBART-50 is an encoder-decoder translation model, so it is loaded with the
# conditional-generation class (imported in the previous cell) rather than
# AutoModelForCausalLM, which would build a decoder-only model.
model2 = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt",
    quantization_config=bnb_config,
    device_map={"": 0},  # place the whole model on device 0
)



In [None]:
tokenizer2.src_lang = "hi_IN"

In [None]:
def toEnglish(en):
    """Translate ``en`` to English using ``model2`` and ``tokenizer2``.

    Despite the parameter name, ``en`` is the *source* text; the source
    language is whatever ``tokenizer2.src_lang`` is set to (the preceding
    cell sets it to ``"hi_IN"``).

    Returns the first decoded sequence with special tokens removed.
    """
    # model2 is loaded onto device 0, while the tokenizer returns CPU
    # tensors; move the inputs to the model's device before generating.
    encoded_hi = tokenizer2(en, return_tensors="pt").to(model2.device)
    generated_tokens = model2.generate(
        **encoded_hi,
        forced_bos_token_id=tokenizer2.lang_code_to_id["en_XX"],
    )
    return tokenizer2.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [None]:
welcome = ['hi', 'hey']

def chatbot_response_hindiFull(test):
    """Answer a Hindi question in Hindi.

    The message is translated to English with ``toEnglish``, summarized if
    it is longer than 20 tokens, matched against the stored questions, and
    the best-matching answer is translated back with ``toHindi``. Unlike
    the other chatbot functions, no greeting shortcut is applied.

    Parameters
    ----------
    test : str
        The user's message.

    Returns
    -------
    str
        ``toHindi`` applied to the best-matching answer.
    """
    # Matching is done in English, so translate the query first.
    test = toEnglish(test)
    # Condense long questions before vectorizing them.
    if len(nltk.word_tokenize(test)) > 20:
        test = summarize_input(test)
    tfidf_test = tfidf_fit.transform([test])
    cm = cosine_similarity(tfidf_test, tfidf_corpus)
    pos = np.argmax(cm[0])
    # pos is a row position, so index positionally; a label lookup
    # (data.answers[pos]) can hit the wrong row once rows have been dropped.
    return toHindi(data.answers.iloc[pos])

In [None]:
# # Remove comments to run chatbot
# print("FARM BOT: हाय!! बाहर निकलने के लिए बाई टाइप करें. मुझे कुछ भी पूछें: ")
# while(True):
#     user_chat = input()
#     if(user_chat.lower()=="bye"):
#         print("Bye")
#         break
#     print("Farm BOT: ", end=" ")
#     print(chatbot_response_hindiFull(user_chat))