## importin libraries

In [47]:
import pandas as pd
import os
import pinecone

from langchain.vectorstores import Pinecone

from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import PromptTemplate

import google.generativeai as genai
from dotenv import find_dotenv, load_dotenv
from pinecone import Pinecone as pc

from tqdm import tqdm

## data cleaning

In [48]:
df = pd.read_csv('Coursera.csv')
df.head()

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...


In [49]:
df['Course Name'].mode()[0]

'Google Cloud Platform Fundamentals: Core Infrastructure'

In [50]:
df = df.drop_duplicates()

In [51]:
df[df['Course Name']=='Google Cloud Platform Fundamentals: Core Infrastructure']

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
3325,Google Cloud Platform Fundamentals: Core Infra...,Google Cloud,Conversant,4.7,https://www.coursera.org/learn/gcp-fundamentals,This course introduces you to important concep...,Google Cloud Platform Big Data Cloud Infrast...


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3424 entries, 0 to 3521
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Course Name         3424 non-null   object
 1   University          3424 non-null   object
 2   Difficulty Level    3424 non-null   object
 3   Course Rating       3424 non-null   object
 4   Course URL          3424 non-null   object
 5   Course Description  3424 non-null   object
 6   Skills              3424 non-null   object
dtypes: object(7)
memory usage: 214.0+ KB


In [53]:
df.describe()

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
count,3424,3424,3424,3424.0,3424,3424,3424
unique,3416,184,5,31.0,3424,3397,3424
top,Corporate Strategy,Coursera Project Network,Beginner,4.7,https://www.coursera.org/learn/write-a-feature...,This specialization offers the latest developm...,Drama Comedy peering screenwriting film D...
freq,2,562,1406,707.0,1,3,1


In [54]:
df['combined'] = df[df.columns[0:]].apply(
    lambda x: ''.join(f'''The Name of the Course: "{x['Course Name']}". The University or Industry Partner that offers the Course: "{x['University']}". The Difficulty Level of the Course: "{x['Difficulty Level']}". The Rating of the Course: "{x['Course Rating']}". The Link or URL of the Course: "{x['Course URL']}". The Description of the Course: "{x['Course Description']}". The Skill Tags associated with the Course: [{x['Skills']}]'''),
    axis=1)
data = df['combined']
data

0       The Name of the Course: "Write A Feature Lengt...
1       The Name of the Course: "Business Strategy: Bu...
2       The Name of the Course: "Silicon Thin Film Sol...
3       The Name of the Course: "Finance for Managers"...
4       The Name of the Course: "Retrieve Data using S...
                              ...                        
3517    The Name of the Course: "Capstone: Retrieving,...
3518    The Name of the Course: "Patrick Henry: Forgot...
3519    The Name of the Course: "Business intelligence...
3520    The Name of the Course: "Rigid Body Dynamics"....
3521    The Name of the Course: "Architecting with Goo...
Name: combined, Length: 3424, dtype: object

In [55]:
data[0]

'The Name of the Course: "Write A Feature Length Screenplay For Film Or Television". The University or Industry Partner that offers the Course: "Michigan State University". The Difficulty Level of the Course: "Beginner". The Rating of the Course: "4.8". The Link or URL of the Course: "https://www.coursera.org/learn/write-a-feature-length-screenplay-for-film-or-television". The Description of the Course: "Write a Full Length Feature Film Script  In this course, you will write a complete, feature-length screenplay for film or television, be it a serious drama or romantic comedy or anything in between. You�ll learn to break down the creative process into components, and you�ll discover a structured process that allows you to produce a polished and pitch-ready script by the end of the course. Completing this project will increase your confidence in your ideas and abilities, and you�ll feel prepared to pitch your first script and get started on your next. This is a course designed to tap in

## loading apis

In [56]:
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
pinecone_enviroment = 'gcp-starter'

In [57]:
pc = pc(api_key=PINECONE_API_KEY)
index = pc.Index("mychatpot")

#### the line below will clear the vector database, be cautious

In [58]:
# index.delete(delete_all=True, namespace='')

In [59]:
model = 'models/embedding-001'
embed = GoogleGenerativeAIEmbeddings(google_api_key=GOOGLE_API_KEY,model=model)

## upserting the data into the vector database

In [61]:
index_name = 'mychatpot'
vstore = Pinecone.from_texts(texts=data, embedding=embed, index_name=index_name)

## check if the Vector Database works

In [62]:
vector = embed.embed_query("data engineering")
result = index.query(
    vector=[vector],
    top_k=10,
    # include_values=True,
    include_metadata=True
)
result.matches

[{'id': 'f9f48a13-28e0-4172-a398-64c1add28ae5',
  'metadata': {'text': 'The Name of the Course: "What is Data Science?". The '
                       'University or Industry Partner that offers the Course: '
                       '"IBM". The Difficulty Level of the Course: '
                       '"Conversant". The Rating of the Course: "4.6". The Link '
                       'or URL of the Course: '
                       '"https://www.coursera.org/learn/what-is-datascience". '
                       'The Description of the Course: "The art of uncovering '
                       'the insights and trends in data has been around since '
                       'ancient times. The ancient Egyptians used census data '
                       'to increase efficiency in tax collection and they '
                       'accurately predicted the flooding of the Nile river '
                       'every year. Since then, people working in data science '
                       'have carved ou