### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
# Always pass a timeout: without one, requests waits indefinitely when there is
# no route to the public internet, which is exactly the case this check probes.
response = requests.get("https://oracle.com", timeout=10)
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation</font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [36]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import set_config
set_config(display="diagram")


import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

In [37]:
# Read the house-price CSV (path is relative to the notebook's working directory).
cali = pd.read_csv(filepath_or_buffer='data/cal_house_price.csv')

In [38]:
# Preview the first five rows of the loaded table.
cali.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [39]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Load the iris dataset as a feature matrix X and a class-label vector y.
iris = load_iris()
X, y = iris.data, iris.target

# Scaler and classifier are bundled in one pipeline so that cross_val_score
# re-fits the scaler together with the model on each training split; the
# held-out fold never contributes to the scaling statistics.
#
# liblinear is a binary solver and the iris target is multiclass, so the
# one-versus-rest scheme is spelled out with OneVsRestClassifier instead of
# relying on LogisticRegression's implicit multiclass handling for liblinear,
# which newer scikit-learn releases deprecate.
# (make_pipeline names this final step 'onevsrestclassifier'.)
logreg_pipeline = make_pipeline(
    StandardScaler(),
    OneVsRestClassifier(LogisticRegression(max_iter=200, solver='liblinear')),
)

# Perform 5-fold cross-validation.
# It returns an array with one accuracy score per fold.
scores = cross_val_score(logreg_pipeline, X, y, scoring='accuracy', cv=5)

# Output the results
print("Accuracy scores for each fold:")
print(scores)
print("Mean accuracy:", scores.mean())
print("Standard deviation of accuracy:", scores.std())

# cross_val_score fits clones and leaves logreg_pipeline itself unfitted, so
# train it on the entire dataset to obtain a model that can be used afterwards.
# Note this is not "the best fold's model": the scores above only estimate how
# well this kind of model generalises.
logreg_pipeline.fit(X, y)

Accuracy scores for each fold:
[0.83333333 0.93333333 0.93333333 0.83333333 1.        ]
Mean accuracy: 0.9066666666666666
Standard deviation of accuracy: 0.06463573143221771


In [40]:
# Re-run the cross-validation of the pipeline, this time with 10 folds.
scores = cross_val_score(
    estimator=logreg_pipeline,
    X=X,
    y=y,
    cv=10,
    scoring='accuracy',
)

# Report the per-fold accuracies, then their mean and spread.
print("Accuracy scores for each fold:", scores, sep="\n")
mean_accuracy = scores.mean()
print("Mean accuracy:", mean_accuracy)
accuracy_std = scores.std()
print("Standard deviation of accuracy:", accuracy_std)

# Fit the pipeline itself on every sample; as the last expression of the cell,
# the fitted pipeline is also what the cell displays.
logreg_pipeline.fit(X, y)

Accuracy scores for each fold:
[0.8        0.86666667 1.         0.86666667 0.93333333 0.93333333
 0.8        0.86666667 0.93333333 1.        ]
Mean accuracy: 0.9
Standard deviation of accuracy: 0.0683130051063973


In [41]:
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Use a leave-one-out splitter as the cross-validation strategy and score the
# pipeline with the estimator's default scorer (no `scoring` is passed).
loo = LeaveOneOut()
scores = cross_val_score(estimator=logreg_pipeline, X=X, y=y, cv=loo)

# Summarise the per-split scores.
average_score = scores.mean()
score_std = scores.std()
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", average_score)
print("Standard deviation of accuracy:", score_std)

Cross Validation Scores:  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
Average CV Score:  0.8933333333333333
Standard deviation of accuracy: 0.308688984074406


In [33]:
# We can now quickly sample a training set while holding out 40% of the data for testing (evaluating) our classifier:

In [35]:
from sklearn import svm

# Hold out 40% of the samples for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Train a linear-kernel support vector classifier on the training portion.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)

# The cell's output is the score on the held-out portion.
clf.score(X_test, y_test)


0.9666666666666667