In [None]:
import seaborn as sns

# Fetch seaborn's "tips" example dataset and render the resulting frame.
tips = sns.load_dataset("tips")
display(tips)


In [None]:
import matplotlib.pyplot as plt

# Feature: the bill amount, kept as a one-column DataFrame.
# Target: the tip, as a Series.
X = tips[["total_bill"]]
y = tips["tip"]

# Plot the raw observations before any model is fitted.
plt.figure(figsize=(8, 6))
plt.scatter(X, y, color="black")
plt.xlabel("Total Bill ($)")
plt.ylabel("Tip ($)")
plt.title("Linear Regression Example (Tips Dataset)")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.linear_model import LinearRegression

# Build an ordinary linear regression estimator ...
linear_model = LinearRegression()

# ... and fit it using only the training portion of the data.
linear_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_squared_error

# Predicted tips for the held-out bills.
y_pred = linear_model.predict(X_test)

# Score the predictions against the true held-out tips with the
# Mean Squared Error (MSE).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)


In [None]:
# Draw the fitted line (blue) over the held-out observations (black).
plt.figure(figsize=(8, 6))
plt.scatter(X_test, y_test, color="black")
plt.plot(X_test, y_pred, linewidth=3, color="blue")
plt.xlabel("Total Bill ($)")
plt.ylabel("Tip ($)")
plt.title("Linear Regression Example (Tips Dataset)")
plt.show()


In [None]:
import pandas as pd

# Predict the tip for one new bill of 35. The frame reuses the training
# column name 'total_bill'.
test_data = pd.DataFrame({"total_bill": [35]})
linear_model.predict(test_data)


In [None]:
import seaborn as sns

# Fetch seaborn's "mpg" example dataset and render it.
df = sns.load_dataset("mpg")
display(df)


In [None]:
# Binary label column: 1 when the car's origin is 'usa', otherwise 0.
df["is_usa"] = df["origin"].eq("usa").astype(int)
display(df)


In [None]:
# Feature: vehicle weight (one-column DataFrame).
# Target: the 0/1 is_usa label.
X = df[["weight"]]
y = df["is_usa"]

# Scatter the label against weight (no model is fitted at this point).
plt.figure(figsize=(8, 6))
plt.scatter(X, y, color="black")
plt.xlabel("Weight")
plt.ylabel("From USA?")
plt.title("Scatter plot of weights and is_usa")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test partition of the weight/is_usa data, seeded so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

# Fit on the training rows only.
model.fit(X_train, y_train)


In [None]:
# Predicted class labels for the held-out rows.
y_pred = model.predict(X=X_test)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Compare the predicted labels with the true held-out labels:
# the fraction classified correctly, and the per-class breakdown of
# correct and incorrect predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)


In [None]:
import seaborn as sns

# Reload the dataset and let seaborn draw a logistic fit (red curve) of the
# boolean "origin is usa" indicator against weight, over the blue raw points.
mpg = sns.load_dataset("mpg")
sns.regplot(
    x=mpg["weight"],
    y=(mpg["origin"] == "usa").rename("from_usa"),
    logistic=True,
    scatter_kws={"color": "blue"},
    line_kws={"color": "red"},
)


In [None]:
import pandas as pd

# Two unseen weights to classify; the column name matches the training feature.
test_data = pd.DataFrame({"weight": [3500, 2200]})

print(model.predict(test_data))        # predicted class labels
print(model.predict_proba(test_data))  # per-class probabilities behind those labels


In [None]:
import pandas as pd
import seaborn as sns

# Read the rental listings, then plot price against floor area with one colour
# per value of 'rented'. fit_reg=False leaves out the regression line so that
# only the points are drawn.
df = pd.read_csv("prices.csv")
sns.lmplot(
    data=df,
    x="floor_area",
    y="rental_price",
    hue="rented",
    palette="Set2",
    fit_reg=False,
    scatter_kws={"s": 50},  # marker size
)


In [None]:
import numpy as np

# Feature matrix as a plain array: column 0 is floor_area, column 1 is rental_price.
X = df[["floor_area", "rental_price"]].values

# Integer labels: 1 where 'rented' equals 'y', 0 for everything else.
y = np.where(df["rented"] == "y", 1, 0)


In [None]:
from sklearn import svm

# Support vector classifier with a linear kernel, fitted on the full data set
# (no train/test split is made for this example).
model = svm.SVC(kernel="linear").fit(X=X, y=y)


In [None]:
import matplotlib.pyplot as plt

#---number of grid points sampled along each axis of the mesh---
mesh_points = 200

#---min and max for the first feature, padded by 1 on each side---
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1

#---min and max for the second feature, padded by 1 on each side---
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

#---step size in the mesh along the first feature---
# The previous formula, (x_max / x_min) / 20, divided the bounds instead of
# subtracting them. A ratio is not a length: it produces a tiny step (and a
# huge mesh) for typical data, and a zero/negative/undefined step whenever
# x_min <= 0. It was also reused for the second axis even though the two
# features live on different scales.
h = (x_max - x_min) / (mesh_points - 1)

#---build a fixed-resolution grid; each axis is sampled over its own range---
xx, yy = np.meshgrid(np.linspace(x_min, x_max, mesh_points),
                     np.linspace(y_min, y_max, mesh_points))

#---make predictions for each of the points in xx,yy---
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])

#---draw the result using a color plot---
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues, alpha=0.3)

plt.xlabel('Floor area of house (sq. ft.)')
plt.ylabel('Rental price')
plt.title("Floor area of houses and their rental Prices")


In [None]:
def will_it_rent(floor_area, rental_price):
    """Print whether the module-level `model` predicts the listing will rent.

    The two arguments are passed to `model.predict` as a single sample, in the
    order (floor_area, rental_price). A predicted label of 0 prints
    'Will not rent!'; any other label prints 'Will rent!'. Nothing is returned.
    """
    predicted = model.predict([[floor_area, rental_price]])
    print('Will not rent!' if predicted == 0 else 'Will rent!')

#---try the classifier on two hypothetical listings---
will_it_rent(floor_area=1000, rental_price=4000)  # Will not rent!
will_it_rent(floor_area=940, rental_price=2200)   # Will rent!


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Simulate a customer data set. Seeding NumPy's global generator makes the
# draws repeatable; TotalPurchase is drawn before TimeInShop.
np.random.seed(42)
size_of_data = 500

data = {
    # purchase totals ~ Normal(mean 1000, std 300)
    'TotalPurchase': np.random.normal(loc=1000, scale=300, size=size_of_data),
    # minutes in the shop ~ Normal(mean 30, std 2)
    'TimeInShop': np.random.normal(loc=30, scale=2, size=size_of_data),
}
df = pd.DataFrame(data)

# Scatter plot of TotalPurchase vs. TimeInShop (no clustering applied yet,
# so every point is drawn in the same colour).
plt.scatter(df['TotalPurchase'], df['TimeInShop'], s=50, edgecolor='k')
plt.xlabel('Total Purchase Amount')
plt.ylabel('Time in Shop (minutes)')
plt.title('Customers total purchases and time spent in shop')


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the data.
# Select the two feature columns explicitly rather than passing the whole
# frame: a later cell adds a 'Cluster' column to df, and re-running this cell
# afterwards would otherwise scale the labels as a third feature and make
# scaler.transform() reject new two-column samples.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[['TotalPurchase', 'TimeInShop']])


In [None]:
from sklearn.cluster import KMeans

# K-Means with k=4 on the standardized features, to split the customers into
# four segments. random_state fixes the initialisation for repeatable labels.
kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto')
kmeans.fit(X_scaled)


In [None]:
# Attach each customer's assigned cluster index as a new column, then show the frame.
df["Cluster"] = kmeans.labels_
display(df)


In [None]:
# Plot the segmentation: the same scatter as before, now with each point
# coloured by its cluster label.
plt.figure(figsize=(12, 8))

plt.scatter(
    df['TotalPurchase'],
    df['TimeInShop'],
    c=df['Cluster'],
    cmap='viridis',
    s=50,
    edgecolor='k',
)

# axis labels and title
plt.xlabel('Total Purchase Amount')
plt.ylabel('Time in Shop (minutes)')
plt.title('Customer Segmentation with K-Means Clustering')


In [None]:
# One new customer: total purchase 2000, 26 minutes in the shop.
new_data = pd.DataFrame({'TotalPurchase': [2000], 'TimeInShop': [26]})

# Apply the already-fitted scaler before asking K-Means for the cluster,
# since the clusterer was trained on standardized values.
new_data_scaled = scaler.transform(new_data)
kmeans.predict(new_data_scaled)


In [None]:
!pip install gensim
!pip install nltk


In [None]:
import pandas as pd

# Read the IMDB reviews CSV from the working directory and render it.
df = pd.read_csv("IMDB Dataset.csv")
display(df)


In [None]:
# The text to learn embeddings from: the 'review' column of the frame.
corpus = df["review"]

In [None]:
import nltk

# Tokenizer data used by word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the older pickled 'punkt' models, so request
# both: whichever one the installed version needs will then be present. An
# NLTK version that does not know 'punkt_tab' only reports the failed
# download; it does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Lower-case every review and split it into word tokens: one token list per review.
tokenized_corpus = [word_tokenize(review.lower()) for review in corpus]

# Train 100-dimensional word vectors with a context window of 5.
# min_count=1 keeps every token, even those seen only once; training uses
# 4 worker threads.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Persist the trained model so it can be reloaded without retraining.
model.save("word2vec.model")


In [None]:
# Peek at the first ten tokens of the first tokenized review.
tokenized_corpus[0][0:10]


In [None]:
# Reload the model that was saved to disk.
loaded_model = Word2Vec.load("word2vec.model")

word = 'lousy'

# Look up the learned embedding vector for the word.
vector_representation = loaded_model.wv[word]
print(f"Vector representation of '{word}':", vector_representation)

# Ask for the three nearest words in the embedding space.
similar_words = loaded_model.wv.most_similar(word, topn=3)
print(f"Most similar words to '{word}':", similar_words)


In [None]:
# key_to_index maps every token the model knows to its row index.
# Iterating the mapping yields the tokens themselves.
vocabulary = loaded_model.wv.key_to_index
print("Vocabulary:", list(vocabulary))


In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Iris data: X holds the measurements, y the integer species labels.
iris = load_iris()
X = iris.data
y = iris.target

# Scatter the first two feature columns, coloured by species label.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', s=50, edgecolor='k')

plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.title('Original Data')

# Build legend entries by hand: one round marker per species, with its colour
# taken from the viridis colormap at the species index normalised over the
# range of y.
cmap = plt.get_cmap('viridis')
norm = plt.Normalize(y.min(), y.max())
handles = [
    plt.Line2D([0, 0], [0, 0],
               linestyle='',
               marker='o',
               color=cmap(norm(class_idx)),
               label=species)
    for class_idx, species in enumerate(iris.target_names)
]
plt.legend(title='Species', handles=handles)


In [None]:
from sklearn.preprocessing import StandardScaler

# Rescale every feature to zero mean and unit variance.
X_standardized = StandardScaler().fit_transform(X=X)


In [None]:
from sklearn.decomposition import PCA

# Project the standardized features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_standardized)


In [None]:
# Scatter the two principal components, coloured by species label, and reuse
# the legend handles built for the original-data plot.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', s=50, edgecolor='k')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('Reduced Data (2-dimension PCA)')
plt.legend(title='Species', handles=handles)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# The three stages, in the order the pipeline applies them.
_sc = StandardScaler()          # standardize the features
_pca = PCA(n_components=2)      # reduce to two components
_model = LogisticRegression()   # classify (the step is nevertheless named 'regressor')

# Chain the stages into one estimator.
model = Pipeline(steps=[
    ('std_scaler', _sc),
    ('pca', _pca),
    ('regressor', _model),
])


In [None]:
# Shuffle and hold out 30% of the samples for testing (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Fit every stage of the pipeline on the training portion.
model.fit(X_train, y_train)


In [None]:
# Evaluate the fitted pipeline on the held-out samples.
model.score(X_test, y_test)

In [None]:
import pandas as pd

# One unseen flower, with its values in the order:
#   sepal length, sepal width, petal length, petal width
test_data = pd.DataFrame([[6.0, 2.9, 4.5, 1.0]])

# Predict the class index, then translate it into the species name.
y_pred = model.predict(test_data)
iris.target_names[y_pred[0]]
