# Download the dataset from kaggle and unzip it

In [1]:
# !kaggle datasets download -d bhavesh907/crop-classificationcs2292017usgscroplanddata

In [2]:
# !mv ./crop-classificationcs2292017usgscroplanddata.zip ./Dataset

In [3]:
# !7z x ./Dataset/crop-classificationcs2292017usgscroplanddata.zip 

In [4]:
import numpy as np
from osgeo import gdal, gdal_array
import pandas as pd
import operator
import gc
import os

roi_ds = gdal.Open('./Dataset/cdl2017.tiff', gdal.GA_ReadOnly)

#train_ds = gdal.Open('D:/neurafarms/downloaded_sat_images/rose_mlready/0306.tiff', gdal.GA_ReadOnly)

roi = roi_ds.GetRasterBand(1).ReadAsArray()

# How many pixels are in each class?
classes = np.unique(roi)


# Iterate over all class labels in the ROI image, printing out some information
# for c in classes:
#     print('Class {c} contains {n} pixels'.format(c=c,
#                                                  n=(roi == c).sum()))

In [5]:
dict = {}
for c in classes:
    dict[c] = (roi == c).sum()
sorted_x = sorted(dict.items(), key=operator.itemgetter(1))
print("Top 6 classes and pixel counts \n",sorted_x[-6:])

Top 6 classes and pixel counts 
 [(225, 617040), (121, 766044), (36, 1543068), (75, 4729104), (69, 8311968), (255, 9381144)]


In [6]:
#Select top  5 classes exclude 255 class label

top_classes = [69,75,36,121,225]

In [7]:
# Find how many non-zero entries we have -- i.e. how many training data samples?
n_samples = (roi > 0).sum()
print('We have {n} samples'.format(n=n_samples))

# What are our classification labels?
labels = np.unique(roi[roi > 0])
print('The training data include {n} classes: {classes}'.format(n=labels.size, 
                                                                classes=labels))
# We will need a "X" matrix containing our features, and a "y" array containing our labels
#     These will have n_samples rows
#     In other languages we would need to allocate these and them loop to fill them, but NumPy can be faster

#X = img_b1[roi > 0, :]  
y = roi[roi > 0]

We have 28406694 samples
The training data include 55 classes: [  1   2   4  21  23  24  28  33  36  37  42  48  49  54  57  59  61  66
  67  69  71  72  75  76  77 111 121 122 123 124 131 152 176 190 195 204
 205 206 208 209 212 213 216 217 218 220 224 225 226 227 236 237 238 242
 255]


In [8]:
images = ['./Dataset/20170306.tiff',
          './Dataset/20170410.tiff',
          './Dataset/20170601.tiff',
          './Dataset/20170615.tiff',
          './Dataset/20170708.tiff',
          './Dataset/20170807.tiff',
          './Dataset/20170905.tiff',
          './Dataset/20170923.tiff',
          './Dataset/20171015.tiff',
          './Dataset/20171207.tiff']

In [9]:
#69,75,36,121,225
print("Reading class 69")
final = pd.DataFrame()

for c in top_classes:
    
    temp = pd.DataFrame()
    
    print(c)
    
    for img in images:

        print(img)

        train_ds = gdal.Open(img, gdal.GA_ReadOnly)

        print(train_ds.RasterXSize,train_ds.RasterYSize)

        img_b1 = np.zeros((train_ds.RasterYSize, train_ds.RasterXSize, train_ds.RasterCount),
                       gdal_array.GDALTypeCodeToNumericTypeCode(train_ds.GetRasterBand(1).DataType))
        
        for b in range(img_b1.shape[2]):
            img_b1[:, :, b] = train_ds.GetRasterBand(b + 1).ReadAsArray()
        
        print(img_b1.shape)


        Xt = img_b1[roi==c, :] 
        
        Xt1 = pd.DataFrame(Xt)
        
        Xt2 = Xt1.sample(n=100000)
        
        Xt2.reset_index(drop=True,inplace=True)
        
        temp = pd.concat([Xt2,temp],axis=1)
        
        temp["class"] = c
        #temp.reset_index(drop=True,inplace=True)
      
    final = pd.concat([temp,final],axis=0)
    final.reset_index(drop=True,inplace=True)
    
    gc.collect()

Reading class 69
69
./Dataset/20170306.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170410.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170601.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170615.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170708.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170807.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170905.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170923.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20171015.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20171207.tiff
5718 4969
(4969, 5718, 5)
75
./Dataset/20170306.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170410.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170601.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170615.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170708.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170807.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170905.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20170923.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20171015.tiff
5718 4969
(4969, 5718, 5)
./Dataset/20171207.tiff
571

In [10]:
final

Unnamed: 0,0,1,2,3,4,0.1,1.1,2.1,3.1,4.1,...,1.2,2.2,3.2,4.2,0.2,1.3,2.3,3.3,4.3,class
0,4077.0,3801.0,3588.0,3341.0,3509.0,9578.0,10052.0,10280.0,8968.0,8832.0,...,4815.0,2918.0,4605.0,12363.0,5228.0,4855.0,5013.0,5556.0,6514.0,225
1,3400.0,3065.0,1710.0,3169.0,7538.0,7435.0,7382.0,7149.0,6712.0,6667.0,...,4802.0,2935.0,4860.0,11249.0,4058.0,3586.0,1835.0,4208.0,9019.0,225
2,4518.0,3882.0,3839.0,3623.0,3788.0,6678.0,5938.0,6144.0,5540.0,5463.0,...,9226.0,8795.0,8145.0,8119.0,4128.0,3423.0,1559.0,3836.0,12437.0,225
3,4078.0,3398.0,3342.0,2988.0,3025.0,5294.0,4518.0,3827.0,4085.0,4035.0,...,5541.0,3926.0,5093.0,8735.0,4712.0,4614.0,2746.0,4830.0,8588.0,225
4,4403.0,3869.0,3487.0,3553.0,4081.0,6378.0,5924.0,6098.0,5600.0,5386.0,...,8122.0,7147.0,6846.0,7443.0,4279.0,3817.0,2050.0,4205.0,12931.0,225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,4266.0,3841.0,3807.0,3492.0,3795.0,5265.0,4319.0,3497.0,3859.0,5019.0,...,5763.0,4397.0,5285.0,7629.0,6406.0,6160.0,6222.0,5642.0,5474.0,69
499996,3974.0,3717.0,2986.0,3403.0,4611.0,7109.0,6415.0,6497.0,5964.0,6346.0,...,7474.0,6813.0,6977.0,7597.0,5239.0,4760.0,4163.0,4797.0,5710.0,69
499997,4200.0,3589.0,3595.0,3508.0,3776.0,7275.0,7232.0,7675.0,7041.0,7157.0,...,8149.0,7354.0,7577.0,8455.0,6876.0,6585.0,6435.0,5889.0,5727.0,69
499998,4183.0,3463.0,3674.0,3451.0,3615.0,6829.0,6644.0,6918.0,6959.0,7425.0,...,8731.0,7728.0,7720.0,8194.0,5954.0,5531.0,4750.0,5113.0,5786.0,69


In [11]:
final.columns = ['col_'+str(i) for i in range(51)]

In [12]:
final.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_41,col_42,col_43,col_44,col_45,col_46,col_47,col_48,col_49,col_50
0,4077.0,3801.0,3588.0,3341.0,3509.0,9578.0,10052.0,10280.0,8968.0,8832.0,...,4815.0,2918.0,4605.0,12363.0,5228.0,4855.0,5013.0,5556.0,6514.0,225
1,3400.0,3065.0,1710.0,3169.0,7538.0,7435.0,7382.0,7149.0,6712.0,6667.0,...,4802.0,2935.0,4860.0,11249.0,4058.0,3586.0,1835.0,4208.0,9019.0,225
2,4518.0,3882.0,3839.0,3623.0,3788.0,6678.0,5938.0,6144.0,5540.0,5463.0,...,9226.0,8795.0,8145.0,8119.0,4128.0,3423.0,1559.0,3836.0,12437.0,225
3,4078.0,3398.0,3342.0,2988.0,3025.0,5294.0,4518.0,3827.0,4085.0,4035.0,...,5541.0,3926.0,5093.0,8735.0,4712.0,4614.0,2746.0,4830.0,8588.0,225
4,4403.0,3869.0,3487.0,3553.0,4081.0,6378.0,5924.0,6098.0,5600.0,5386.0,...,8122.0,7147.0,6846.0,7443.0,4279.0,3817.0,2050.0,4205.0,12931.0,225


In [14]:
final.to_csv("./Dataset/final.csv",index=False)